In [ ]:
# ============================
# Title: Multi-Classifier Modeling, Hyperparameter Tuning & Evaluation
# Author: Siyang Ni
# Date: [Date]
# Notes: This script showcases a comprehensive pipeline for loading data,
# preprocessing, model training, hyperparameter tuning, and evaluation
# across multiple algorithms: RandomForest, GradientBoosting,
# HistGradientBoosting, XGBoost, and CatBoost. Includes interpretability
# with SHAP, partial dependence plots, and feature importances.
# ============================
Setting Up¶
In [ ]:
# ================
# 1. IMPORTS
# ================
# !pip install --upgrade pandas numpy matplotlib seaborn joblib scikit-learn xgboost catboost shap optuna
# pip install --upgrade ipywidgets
import os
import logging
import warnings
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.model_selection import (
train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV, RepeatedStratifiedKFold
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from sklearn.ensemble import (
RandomForestClassifier, GradientBoostingClassifier,
HistGradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
# Interpretability
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
import shap
# Optimization
import optuna
In [ ]:
# ================
# 2. CONFIGURATION
# ================
RANDOM_STATE = 42
TEST_SIZE = 0.2
N_SPLITS_CV = 5
SCORING_METRIC = 'roc_auc'
VERBOSE = 1
CPU_COUNT = os.cpu_count()
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
In [ ]:
# ================
# 3. HELPER FUNCTIONS
# ================
def load_data(filepath: str) -> pd.DataFrame:
"""
Loads data from a CSV file into a Pandas DataFrame.
Parameters
----------
filepath : str
Full path to the CSV file.
Returns
-------
pd.DataFrame or None
Loaded DataFrame if successful, None if file not found.
"""
try:
df = pd.read_csv(os.path.expanduser(filepath))
logging.info("Data loaded successfully.")
return df
except FileNotFoundError:
logging.error(f"File not found at {filepath}")
return None
def identify_categorical_columns(df: pd.DataFrame) -> list:
"""
Identifies columns of type object or category in a DataFrame.
"""
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
logging.info(f"Identified categorical columns: {categorical_cols}")
return categorical_cols
def convert_to_categorical(df: pd.DataFrame, columns: list) -> None:
"""
Converts specified columns in a DataFrame to categorical type in-place.
"""
for col in columns:
if col in df.columns:
df[col] = df[col].astype('category')
else:
logging.warning(f"Column '{col}' not found in DataFrame.")
logging.info("Categorical conversion complete.")
def create_train_test_split(
X: pd.DataFrame,
y: pd.Series,
test_size: float = TEST_SIZE,
random_state: int = RANDOM_STATE
) -> tuple:
"""
Splits data into training and testing sets.
"""
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=test_size, random_state=random_state,
stratify=y, shuffle=True
)
logging.info(f"Training set shape: {X_train.shape}")
logging.info(f"Testing set shape: {X_test.shape}")
return X_train, X_test, y_train, y_test
def create_missing_indicators(
X_train: pd.DataFrame,
X_test: pd.DataFrame
) -> tuple:
"""
Creates binary indicators for missing values in features.
"""
missing_indicator = MissingIndicator(features='all')
missing_indicator.fit(X_train)
X_train_flags = missing_indicator.transform(X_train)
X_test_flags = missing_indicator.transform(X_test)
missing_columns = [f'missing_{col}' for col in X_train.columns]
X_train_with_indicators = pd.concat(
[X_train.reset_index(drop=True),
pd.DataFrame(X_train_flags, columns=missing_columns)],
axis=1
)
X_test_with_indicators = pd.concat(
[X_test.reset_index(drop=True),
pd.DataFrame(X_test_flags, columns=missing_columns)],
axis=1
)
logging.info("Missing indicators created.")
return X_train_with_indicators, X_test_with_indicators
def create_preprocessor(categorical_features: list) -> ColumnTransformer:
"""
Creates a preprocessor for categorical features using OneHotEncoder
while passing other columns through without transformation.
"""
preprocessor = ColumnTransformer(
transformers=[
(
'cat',
OneHotEncoder(
drop='first',
handle_unknown='ignore'
),
categorical_features
)
],
remainder='passthrough'
)
return preprocessor
def train_evaluate_model(
model,
X_train: pd.DataFrame,
y_train: pd.Series,
X_test: pd.DataFrame,
y_test: pd.Series,
model_name: str = "Model",
save_path: str = None
):
"""
Trains, evaluates, and optionally saves a model.
Prints confusion matrix, classification report, and ROC AUC.
Plots the ROC curve.
"""
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
logging.info(f"=== {model_name} Evaluation ===")
logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred)))
logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred)))
logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'{model_name} ROC Curve on Test Data')
plt.legend(loc='lower right')
plt.show()
if save_path:
joblib.dump(model, save_path)
logging.info(f"{model_name} saved to '{save_path}'.")
return model
def perform_grid_search(
pipeline: Pipeline,
param_grid: dict,
X_train: pd.DataFrame,
y_train: pd.Series,
cv=None,
scoring: str = 'roc_auc',
n_jobs: int = -1,
verbose: int = 1
):
"""
Performs GridSearchCV for hyperparameter tuning on a pipeline.
"""
if cv is None:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
estimator=pipeline,
param_grid=param_grid,
cv=cv,
scoring=scoring,
n_jobs=n_jobs,
verbose=verbose
)
grid_search.fit(X_train, y_train)
logging.info("Best parameters found: " + str(grid_search.best_params_))
logging.info(f"Best cross-validation {scoring}: {grid_search.best_score_:.4f}")
return grid_search.best_estimator_
def plot_feature_importance(
model,
feature_names: list,
top_n: int = 20,
title: str = "Feature Importance"
):
"""
Plots the top N feature importances from a trained model.
"""
if hasattr(model, 'feature_importances_'):
importances = model.feature_importances_
elif hasattr(model, 'named_steps') and 'classifier' in model.named_steps:
if hasattr(model.named_steps['classifier'], 'feature_importances_'):
importances = model.named_steps['classifier'].feature_importances_
else:
raise ValueError("Classifier does not have feature_importances_ attribute.")
else:
raise ValueError("Provided model does not have feature_importances_ attribute.")
fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
fi_df = fi_df.sort_values('Importance', ascending=False).head(top_n)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=fi_df)
plt.title(title)
plt.tight_layout()
plt.show()
def aggregate_feature_importance(importances: np.ndarray, encoded_feature_names: list) -> pd.DataFrame:
"""
Aggregates feature importance of one-hot-encoded features back to original feature names.
"""
original_features = list(set([feat.split('_')[0] for feat in encoded_feature_names]))
original_feature_importance = {feature: 0 for feature in original_features}
for i, encoded_feature in enumerate(encoded_feature_names):
base_feature = encoded_feature.split('_')[0]
original_feature_importance[base_feature] += importances[i]
importance_df = pd.DataFrame(
list(original_feature_importance.items()),
columns=['Feature', 'Importance']
)
importance_df = importance_df.sort_values('Importance', ascending=False)
return importance_df
def plot_aggregated_feature_importance(
importance_df: pd.DataFrame,
top_n: int = 20,
title: str = "Aggregated Feature Importance"
):
"""
Plots aggregated feature importances after grouping by base feature.
"""
top_n_df = importance_df.head(top_n).sort_values(by='Importance', ascending=True)
plt.figure(figsize=(10, 6))
plt.barh(y=top_n_df['Feature'], width=top_n_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title(title)
plt.tight_layout()
plt.show()
Preprocessing¶
In [ ]:
# --- Data Loading ---
data_filepath = os.path.expanduser('~/work/vaping_project_data/processed_data_g12nn.csv')
new_data = load_data(data_filepath)
if new_data is None:
logging.error("Data loading failed. Exiting script.")
raise SystemExit
logging.info("Dataset Info:")
new_data.info()
2025-02-15 11:02:48,145 - INFO - Data loaded successfully. 2025-02-15 11:02:48,146 - INFO - Dataset Info:
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32730 entries, 0 to 32729 Data columns (total 51 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 V2178 31840 non-null float64 1 V2188 31432 non-null float64 2 V2197 31034 non-null float64 3 V2184 30687 non-null float64 4 V2186 31432 non-null float64 5 V2171 32096 non-null float64 6 V2128 32426 non-null float64 7 V2201 31005 non-null float64 8 V2173 31855 non-null float64 9 V2194 31412 non-null float64 10 V2166 31888 non-null float64 11 wave 32730 non-null int64 12 V2176 30613 non-null float64 13 V2175 31149 non-null float64 14 V2177 31046 non-null float64 15 nicotine12d 32730 non-null float64 16 V2116 32035 non-null float64 17 V2125 32302 non-null float64 18 V2182 30414 non-null float64 19 sex 32730 non-null float64 20 race 32730 non-null float64 21 V2460 31446 non-null float64 22 RESPONDENT_AGE 32665 non-null float64 23 V2185 31432 non-null float64 24 V2193 29539 non-null float64 25 V2163 32328 non-null float64 26 V49 32516 non-null float64 27 V2108 31410 non-null float64 28 V2101 32452 non-null float64 29 V2180 30589 non-null float64 30 V2164 32315 non-null float64 31 V2191 31502 non-null float64 32 V2195 31287 non-null float64 33 V2155 32448 non-null float64 34 V2196 31293 non-null float64 35 V2189 31432 non-null float64 36 V2179 31819 non-null float64 37 V13 32730 non-null int64 38 V2143 32494 non-null float64 39 V2134 32450 non-null float64 40 V2172 31932 non-null float64 41 V2137 32397 non-null float64 42 V2140 32545 non-null float64 43 V2105 31702 non-null float64 44 V2157 32448 non-null float64 45 V2183 31191 non-null float64 46 V2187 31432 non-null float64 47 V2181 30447 non-null float64 48 V2152 32636 non-null float64 49 V2153 32451 non-null float64 50 V2156 32448 non-null float64 dtypes: float64(49), int64(2) memory usage: 12.7 MB
In [ ]:
# -----------------------------------------------------------------------------
# Missing Data Analysis
# -----------------------------------------------------------------------------
total_missing = new_data.isna().sum().sum()
print("\nTotal missing values:", total_missing)
# Count negative values in numeric columns.
numeric_cols = new_data.select_dtypes(include=[np.number]).columns
negative_counts = new_data[numeric_cols].apply(lambda x: (x < 0).sum())
negative_counts_df = pd.DataFrame({
'Column': negative_counts.index,
'Negative_Count': negative_counts.values
})
print("\nNegative value counts by numeric column:")
print(negative_counts_df)
# Replace negative codes (-9, -8) with NaN.
missing_codes = [-9, -8]
new_data[numeric_cols] = new_data[numeric_cols].replace({-9: np.nan, -8: np.nan})
# Compute missing counts and percentages.
missing_counts = new_data.isna().sum()
missing_percent = (new_data.isna().mean() * 100).round(2)
missing_summary = pd.DataFrame({
'Missing_Count': missing_counts,
'Missing_Percentage': missing_percent
}).sort_values(by='Missing_Percentage', ascending=False)
print("\nMissing values summary:")
print(missing_summary.to_string())
Total missing values: 47868
Negative value counts by numeric column:
Column Negative_Count
0 V2178 0
1 V2188 0
2 V2197 0
3 V2184 0
4 V2186 0
5 V2171 0
6 V2128 0
7 V2201 0
8 V2173 0
9 V2194 0
10 V2166 0
11 wave 0
12 V2176 0
13 V2175 0
14 V2177 0
15 nicotine12d 0
16 V2116 0
17 V2125 0
18 V2182 0
19 sex 0
20 race 0
21 V2460 0
22 RESPONDENT_AGE 0
23 V2185 0
24 V2193 0
25 V2163 0
26 V49 0
27 V2108 0
28 V2101 0
29 V2180 0
30 V2164 0
31 V2191 0
32 V2195 0
33 V2155 0
34 V2196 0
35 V2189 0
36 V2179 0
37 V13 0
38 V2143 0
39 V2134 0
40 V2172 0
41 V2137 0
42 V2140 0
43 V2105 0
44 V2157 0
45 V2183 0
46 V2187 0
47 V2181 0
48 V2152 0
49 V2153 0
50 V2156 0
Missing values summary:
Missing_Count Missing_Percentage
V2193 3191 9.75
V2182 2316 7.08
V2181 2283 6.98
V2180 2141 6.54
V2176 2117 6.47
V2184 2043 6.24
V2201 1725 5.27
V2197 1696 5.18
V2177 1684 5.15
V2175 1581 4.83
V2183 1539 4.70
V2195 1443 4.41
V2196 1437 4.39
V2108 1320 4.03
V2194 1318 4.03
V2189 1298 3.97
V2188 1298 3.97
V2185 1298 3.97
V2187 1298 3.97
V2186 1298 3.97
V2460 1284 3.92
V2191 1228 3.75
V2105 1028 3.14
V2179 911 2.78
V2178 890 2.72
V2173 875 2.67
V2166 842 2.57
V2172 798 2.44
V2116 695 2.12
V2171 634 1.94
V2125 428 1.31
V2164 415 1.27
V2163 402 1.23
V2137 333 1.02
V2128 304 0.93
V2155 282 0.86
V2134 280 0.86
V2157 282 0.86
V2156 282 0.86
V2101 278 0.85
V2153 279 0.85
V2143 236 0.72
V49 214 0.65
V2140 185 0.57
V2152 94 0.29
RESPONDENT_AGE 65 0.20
race 0 0.00
V13 0 0.00
sex 0 0.00
nicotine12d 0 0.00
wave 0 0.00
In [ ]:
# Correlation Analysis
# Select numeric variables (excluding the target variable 'nicotine12d' if desired).
cor_vars = new_data.drop(columns=['nicotine12d'], errors='ignore').select_dtypes(include=[np.number])
# Compute the Spearman correlation matrix.
cor_matrix_spearman = cor_vars.corr(method='spearman')
# Check for non-finite values in the correlation matrix.
if not np.all(np.isfinite(cor_matrix_spearman)):
print("\nWarning: Non-finite values detected in the correlation matrix.")
# Replace NaN or infinite values with 0 (or another appropriate value).
cor_matrix_spearman = cor_matrix_spearman.fillna(0)
cor_matrix_spearman = cor_matrix_spearman.replace([np.inf, -np.inf], 0)
print("\nSpearman Correlation Matrix:")
print(cor_matrix_spearman)
# Create an enhanced heatmap with clustering.
clustergrid = sns.clustermap(cor_matrix_spearman, cmap="coolwarm", figsize=(12, 12))
clustergrid.ax_heatmap.set_title("Enhanced Spearman Correlation Heatmap")
plt.show()
# Identify highly correlated pairs (absolute correlation > 0.5 and less than 1).
high_corr_pairs = []
cols = cor_matrix_spearman.columns
for i in range(len(cols)):
for j in range(i+1, len(cols)):
corr_value = cor_matrix_spearman.iloc[i, j]
if 0.5 < abs(corr_value) < 1:
high_corr_pairs.append({
"Variable1": cols[i],
"Variable2": cols[j],
"Correlation": corr_value
})
high_corr_df = pd.DataFrame(high_corr_pairs)
print("\nHighly correlated variable pairs (|corr| > 0.5):")
print(high_corr_df)
Spearman Correlation Matrix:
V2178 V2188 V2197 V2184 V2186 V2171 \
V2178 1.000000 -0.049629 0.093042 -0.015743 0.008092 0.046024
V2188 -0.049629 1.000000 -0.042752 0.321003 -0.026227 -0.102099
V2197 0.093042 -0.042752 1.000000 -0.035598 0.033113 0.020207
V2184 -0.015743 0.321003 -0.035598 1.000000 -0.065073 -0.059107
V2186 0.008092 -0.026227 0.033113 -0.065073 1.000000 0.022292
V2171 0.046024 -0.102099 0.020207 -0.059107 0.022292 1.000000
V2128 0.089366 -0.008868 0.071150 -0.007436 0.017063 0.014454
V2201 0.058376 0.004788 0.216064 0.014461 0.019671 -0.000423
V2173 -0.093280 0.228164 -0.017428 0.280610 -0.028203 -0.078977
V2194 0.104285 0.012967 0.136308 0.021631 0.001037 -0.024662
V2166 0.024141 -0.060672 -0.072844 -0.039552 -0.060795 0.041513
wave 0.008739 -0.057778 -0.006440 -0.082878 -0.043683 0.011258
V2176 0.414728 -0.035326 0.094930 -0.025964 0.002425 0.009332
V2175 0.097227 -0.023300 0.032360 0.004988 0.011282 0.010744
V2177 0.101125 0.010241 0.035401 0.025626 0.010682 -0.004172
V2116 0.213772 -0.020080 0.143596 -0.033305 0.018758 0.012502
V2125 0.091962 -0.028509 0.074941 -0.015441 0.021268 0.039509
V2182 0.044384 -0.039058 -0.019987 -0.028834 0.044192 -0.000364
sex 0.003834 0.144495 -0.063447 0.208078 -0.123320 -0.055563
race 0.061858 -0.002563 -0.049980 0.016777 -0.002884 0.022663
V2460 0.072343 -0.037442 0.064067 -0.011501 0.020640 0.059038
RESPONDENT_AGE 0.006229 -0.050273 0.052265 -0.043983 0.020336 0.022078
V2185 0.017683 -0.184175 0.026835 -0.164759 0.133994 0.027523
V2193 0.084958 0.036811 0.050691 0.066786 0.010752 -0.008691
V2163 -0.040773 0.055863 -0.007004 0.095256 -0.018676 0.002841
V49 0.026545 -0.051102 0.005809 -0.057999 0.046493 0.012928
V2108 0.138442 -0.010992 0.150654 0.008852 0.025111 -0.000746
V2101 0.136726 -0.093390 0.134853 -0.078096 0.064537 0.034624
V2180 0.032746 -0.202660 0.012872 -0.109447 0.086755 0.025446
V2164 -0.042054 0.041992 0.020642 0.116081 -0.014283 0.002094
V2191 0.088517 -0.061298 0.145793 -0.062489 0.066350 -0.007981
V2195 0.064231 -0.009012 0.140739 0.013028 0.012680 -0.004597
V2155 -0.043915 0.086994 -0.019294 0.039762 -0.010458 -0.064640
V2196 0.046534 0.020896 0.275856 0.021864 0.030154 -0.049781
V2189 -0.028700 0.382564 -0.027480 0.583240 -0.025607 -0.067127
V2179 -0.163579 0.220732 -0.040714 0.257089 -0.070399 -0.081527
V13 0.118681 -0.020140 0.017733 -0.015608 0.017816 0.033269
V2143 0.077951 -0.008014 0.059433 -0.009263 0.025706 0.016391
V2134 0.064991 -0.007664 0.042174 0.002483 0.011987 0.012277
V2172 0.058068 -0.254797 -0.003818 -0.258314 0.044257 0.091775
V2137 0.079653 -0.007028 0.056565 -0.001482 0.012795 0.025179
V2140 0.026645 -0.024031 0.031365 0.005124 0.000927 0.038686
V2105 0.172417 0.028762 0.175986 0.042125 0.032539 -0.015343
V2157 -0.013618 0.077304 -0.037788 0.044640 -0.006680 -0.048150
V2183 -0.058165 0.536921 -0.029821 0.526226 -0.088325 -0.113955
V2187 0.014367 -0.005225 -0.018045 -0.082054 0.050283 0.014671
V2181 0.017542 -0.095575 0.050725 -0.050498 0.534619 0.036806
V2152 0.040213 0.128085 -0.046285 0.142611 -0.055261 -0.035358
V2153 -0.024463 0.018920 -0.017911 -0.020044 0.004421 -0.027370
V2156 -0.027227 0.118125 -0.031946 0.093757 -0.028257 -0.085951
V2128 V2201 V2173 V2194 ... V2137 \
V2178 0.089366 0.058376 -0.093280 0.104285 ... 0.079653
V2188 -0.008868 0.004788 0.228164 0.012967 ... -0.007028
V2197 0.071150 0.216064 -0.017428 0.136308 ... 0.056565
V2184 -0.007436 0.014461 0.280610 0.021631 ... -0.001482
V2186 0.017063 0.019671 -0.028203 0.001037 ... 0.012795
V2171 0.014454 -0.000423 -0.078977 -0.024662 ... 0.025179
V2128 1.000000 0.054623 0.000138 0.069551 ... 0.307463
V2201 0.054623 1.000000 0.027688 0.096337 ... 0.031748
V2173 0.000138 0.027688 1.000000 0.042823 ... -0.011170
V2194 0.069551 0.096337 0.042823 1.000000 ... 0.048543
V2166 -0.035053 -0.062426 -0.163089 -0.091508 ... -0.007848
wave -0.065838 0.013591 -0.011573 -0.035767 ... -0.067752
V2176 0.097158 0.074899 -0.085093 0.098826 ... 0.080364
V2175 0.042698 0.027308 -0.088557 0.044307 ... 0.046814
V2177 0.039213 0.036957 -0.039010 0.042682 ... 0.026470
V2116 0.211496 0.113149 -0.053363 0.174373 ... 0.182534
V2125 0.279562 0.040517 -0.021766 0.061210 ... 0.250703
V2182 0.001720 -0.037450 -0.205418 -0.031576 ... 0.009013
sex -0.022753 0.018452 -0.018493 -0.062476 ... 0.000668
race -0.012443 -0.050088 -0.097910 -0.023764 ... 0.005081
V2460 0.167413 0.035037 -0.020208 0.028924 ... 0.144419
RESPONDENT_AGE 0.009429 0.009460 -0.014265 0.018860 ... 0.000563
V2185 0.015976 0.002617 -0.108135 -0.008964 ... 0.016480
V2193 0.037751 0.028148 0.018646 0.177822 ... 0.039115
V2163 0.006270 0.026093 0.140415 0.029167 ... -0.013144
V49 0.004094 -0.015206 -0.120524 -0.044455 ... -0.003172
V2108 0.194583 0.092103 0.011494 0.194339 ... 0.130007
V2101 0.212239 0.078118 -0.076818 0.107508 ... 0.176522
V2180 0.007736 -0.019815 -0.176829 -0.022944 ... 0.015510
V2164 0.009983 0.045771 0.159947 0.049470 ... -0.012324
V2191 0.049851 0.113170 -0.048448 0.054998 ... 0.031524
V2195 0.043004 0.096540 0.029210 0.287661 ... 0.036775
V2155 -0.010384 0.017098 0.119758 0.040539 ... -0.028719
V2196 0.059567 0.223303 0.127376 0.256581 ... 0.024912
V2189 0.003447 0.024631 0.260970 0.013663 ... 0.009314
V2179 -0.049856 0.017037 0.538661 0.004064 ... -0.044086
V13 0.012240 -0.030317 -0.085578 -0.028648 ... 0.023291
V2143 0.301238 0.037267 -0.014579 0.058836 ... 0.338213
V2134 0.225288 0.037230 -0.008580 0.024640 ... 0.318344
V2172 -0.003964 -0.045249 -0.302182 -0.047912 ... -0.000513
V2137 0.307463 0.031748 -0.011170 0.048543 ... 1.000000
V2140 0.087342 0.005181 -0.003708 0.011889 ... 0.128621
V2105 0.180318 0.132834 0.043948 0.225754 ... 0.138353
V2157 -0.026373 -0.008458 0.033399 -0.016665 ... -0.032433
V2183 -0.010688 0.026671 0.365098 0.045440 ... -0.012606
V2187 -0.002975 -0.028137 -0.169927 -0.023250 ... 0.007803
V2181 0.026554 0.019818 -0.064495 0.005395 ... 0.015473
V2152 0.010043 -0.008924 0.109886 0.029292 ... 0.002346
V2153 -0.007439 -0.005400 -0.008876 -0.021156 ... -0.014636
V2156 -0.027584 -0.000554 0.088407 0.015746 ... -0.032821
V2140 V2105 V2157 V2183 V2187 V2181 \
V2178 0.026645 0.172417 -0.013618 -0.058165 0.014367 0.017542
V2188 -0.024031 0.028762 0.077304 0.536921 -0.005225 -0.095575
V2197 0.031365 0.175986 -0.037788 -0.029821 -0.018045 0.050725
V2184 0.005124 0.042125 0.044640 0.526226 -0.082054 -0.050498
V2186 0.000927 0.032539 -0.006680 -0.088325 0.050283 0.534619
V2171 0.038686 -0.015343 -0.048150 -0.113955 0.014671 0.036806
V2128 0.087342 0.180318 -0.026373 -0.010688 -0.002975 0.026554
V2201 0.005181 0.132834 -0.008458 0.026671 -0.028137 0.019818
V2173 -0.003708 0.043948 0.033399 0.365098 -0.169927 -0.064495
V2194 0.011889 0.225754 -0.016665 0.045440 -0.023250 0.005395
V2166 -0.003089 -0.159747 -0.014198 -0.109900 0.086412 -0.047589
wave 0.000054 -0.068175 -0.002512 -0.094130 0.024373 -0.052886
V2176 0.029716 0.222134 -0.005094 -0.055107 0.014491 0.010430
V2175 0.017742 0.079870 -0.026631 -0.041746 0.053744 0.016969
V2177 0.014397 0.079985 0.007484 -0.004764 0.025826 0.013020
V2116 0.029111 0.532954 -0.045172 -0.034615 -0.017631 0.028145
V2125 0.191464 0.142629 -0.031526 -0.029624 -0.003327 0.029798
V2182 -0.001022 -0.097161 0.009732 -0.187248 0.594918 0.165268
sex -0.013659 0.018516 0.035184 0.199213 0.000268 -0.224100
race -0.005319 0.002361 0.098590 -0.061729 0.096779 -0.003990
V2460 0.214431 0.070677 -0.027215 -0.031959 -0.007748 0.039341
RESPONDENT_AGE 0.001903 0.024208 -0.041384 -0.049884 0.003464 0.030982
V2185 0.001801 -0.012093 -0.019729 -0.288984 0.185083 0.127662
V2193 0.020512 0.098869 -0.039690 0.058654 -0.000871 0.005188
V2163 0.002053 0.026985 -0.003156 0.128013 -0.098676 -0.023328
V49 -0.005730 -0.033637 0.178629 -0.110485 0.081026 0.074250
V2108 0.039752 0.542470 -0.014267 0.025048 -0.054777 0.014596
V2101 0.046630 0.390300 -0.071695 -0.109326 0.003128 0.075156
V2180 0.012439 -0.059889 -0.010921 -0.329192 0.191071 0.283127
V2164 0.000331 0.054901 -0.005803 0.136287 -0.115267 -0.018556
V2191 0.005816 0.132798 0.000377 -0.081921 0.047335 0.088670
V2195 0.011747 0.175860 -0.023720 0.012994 0.006767 0.026602
V2155 -0.024549 0.050861 0.230122 0.106558 -0.059579 -0.028031
V2196 0.008922 0.242067 -0.009056 0.083057 -0.081563 0.047621
V2189 -0.016317 0.070536 0.056400 0.394245 -0.055655 -0.081609
V2179 0.001356 -0.000841 0.036419 0.365207 -0.165824 -0.122872
V13 0.001935 -0.069998 -0.010225 -0.082091 0.124938 0.038828
V2143 0.151030 0.134530 -0.029176 -0.019911 0.003988 0.025124
V2134 0.126647 0.094124 -0.014274 -0.016830 0.009191 0.018325
V2172 0.011229 -0.102753 -0.035586 -0.356496 0.130779 0.090098
V2137 0.128621 0.138353 -0.032433 -0.012606 0.007803 0.015473
V2140 1.000000 0.025843 -0.027812 -0.013527 -0.006684 0.026274
V2105 0.025843 1.000000 -0.017055 0.067867 -0.082546 0.014312
V2157 -0.027812 -0.017055 1.000000 0.056553 0.011611 -0.017673
V2183 -0.013527 0.067867 0.056553 1.000000 -0.205304 -0.162375
V2187 -0.006684 -0.082546 0.011611 -0.205304 1.000000 0.052120
V2181 0.026274 0.014312 -0.017673 -0.162375 0.052120 1.000000
V2152 -0.011394 0.043009 0.044792 0.174220 -0.072245 -0.080834
V2153 -0.029016 -0.034165 0.003419 -0.007107 0.016330 0.012776
V2156 -0.042198 0.001628 0.269085 0.142227 -0.025125 -0.050770
V2152 V2153 V2156
V2178 0.040213 -0.024463 -0.027227
V2188 0.128085 0.018920 0.118125
V2197 -0.046285 -0.017911 -0.031946
V2184 0.142611 -0.020044 0.093757
V2186 -0.055261 0.004421 -0.028257
V2171 -0.035358 -0.027370 -0.085951
V2128 0.010043 -0.007439 -0.027584
V2201 -0.008924 -0.005400 -0.000554
V2173 0.109886 -0.008876 0.088407
V2194 0.029292 -0.021156 0.015746
V2166 0.003758 0.011218 -0.061093
wave -0.030708 -0.080296 -0.005674
V2176 -0.005422 -0.012170 -0.027676
V2175 -0.010290 -0.006780 -0.032018
V2177 -0.011738 0.000419 -0.013191
V2116 0.049691 -0.017990 -0.055293
V2125 -0.005147 -0.019401 -0.045825
V2182 -0.074204 0.026510 -0.040563
sex 0.009143 0.002239 0.028336
race 0.056905 -0.004022 0.051642
V2460 -0.003214 -0.023490 -0.035494
RESPONDENT_AGE -0.044644 0.003840 -0.049813
V2185 -0.096766 0.005720 -0.042205
V2193 0.059684 -0.020532 -0.006511
V2163 0.058871 0.001608 0.047499
V49 -0.057192 0.010428 -0.083682
V2108 0.023486 -0.028804 -0.004466
V2101 -0.060251 -0.023931 -0.095518
V2180 -0.111524 0.016348 -0.053369
V2164 0.041675 -0.022228 0.051417
V2191 -0.097052 -0.003669 -0.052103
V2195 -0.031077 -0.063056 -0.021474
V2155 0.032753 -0.021865 0.205573
V2196 -0.111577 -0.032775 0.036915
V2189 0.118544 -0.005428 0.092473
V2179 0.008602 -0.014649 0.087404
V13 0.025354 0.007890 -0.028204
V2143 -0.002386 -0.005256 -0.027044
V2134 0.000950 -0.011670 -0.032677
V2172 -0.149836 0.015815 -0.106495
V2137 0.002346 -0.014636 -0.032821
V2140 -0.011394 -0.029016 -0.042198
V2105 0.043009 -0.034165 0.001628
V2157 0.044792 0.003419 0.269085
V2183 0.174220 -0.007107 0.142227
V2187 -0.072245 0.016330 -0.025125
V2181 -0.080834 0.012776 -0.050770
V2152 1.000000 -0.012334 0.094300
V2153 -0.012334 1.000000 0.018627
V2156 0.094300 0.018627 1.000000
[50 rows x 50 columns]
Highly correlated variable pairs (|corr| > 0.5): Variable1 Variable2 Correlation 0 V2188 V2183 0.536921 1 V2184 V2189 0.583240 2 V2184 V2183 0.526226 3 V2186 V2181 0.534619 4 V2173 V2179 0.538661 5 V2116 V2105 0.532954 6 V2182 V2187 0.594918 7 V2185 V2180 0.507020 8 V2108 V2105 0.542470
In [ ]:
# --- Identify & Convert Categorical Columns ---
import logging
# Identify all categorical (object or categorical dtype) columns
categorical_predictor_cols = new_data.select_dtypes(include=['object', 'category']).columns.tolist()
# If you also want to include numerical columns as categorical (optional)
# categorical_predictor_cols = new_data.columns.tolist()
# Convert identified columns to categorical
convert_to_categorical(new_data, categorical_predictor_cols)
# Logging information
logging.info("Verifying data types after conversion:")
logging.info(new_data[categorical_predictor_cols].dtypes)
# --- Train/Test Split ---
X = new_data.drop('nicotine12d', axis=1)
y = new_data['nicotine12d']
X_train, X_test, y_train, y_test = create_train_test_split(X, y)
logging.info("Train Set Balance:")
logging.info(y_train.value_counts(normalize=True))
logging.info("Test Set Balance:")
logging.info(y_test.value_counts(normalize=True))
# --- Missing Value Indicators ---
X_train_with_indicators, X_test_with_indicators = create_missing_indicators(X_train, X_test)
# Treat everything as categorical in this example
categorical_features = X_train_with_indicators.columns.tolist()
# Create & Fit Preprocessor
preprocessor = create_preprocessor(categorical_features)
preprocessor.fit(X_train_with_indicators)
logging.info("Preprocessor fitted successfully.")
2025-02-15 11:02:51,511 - INFO - Categorical conversion complete. 2025-02-15 11:02:51,512 - INFO - Verifying data types after conversion: 2025-02-15 11:02:51,513 - INFO - Series([], dtype: object) 2025-02-15 11:02:51,512 - INFO - Verifying data types after conversion: 2025-02-15 11:02:51,513 - INFO - Series([], dtype: object) 2025-02-15 11:02:51,538 - INFO - Training set shape: (26184, 50) 2025-02-15 11:02:51,539 - INFO - Testing set shape: (6546, 50) 2025-02-15 11:02:51,539 - INFO - Train Set Balance: 2025-02-15 11:02:51,541 - INFO - nicotine12d 1.0 0.552589 0.0 0.447411 Name: proportion, dtype: float64 2025-02-15 11:02:51,542 - INFO - Test Set Balance: 2025-02-15 11:02:51,543 - INFO - nicotine12d 1.0 0.552551 0.0 0.447449 Name: proportion, dtype: float64 2025-02-15 11:02:51,567 - INFO - Missing indicators created. 2025-02-15 11:02:52,047 - INFO - Preprocessor fitted successfully.
Model Training¶
Lasso¶
In [ ]:
# Define the preprocessing for numeric columns (scale them)
numeric_features = X_train_with_indicators.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
# Define the preprocessing for categorical features (encode them)
categorical_features = X_train_with_indicators.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Combine preprocessing steps
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
# Create the pipeline
lasso_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression(penalty='l1', solver='saga'))])
# Define an expanded tuning grid.
# - 'classifier__C': A wide range of regularization strengths.
# - 'classifier__tol': Different tolerance levels for stopping criteria.
# - 'classifier__max_iter': More iterations to ensure convergence.
# - 'preprocessor__cat__drop': Option to drop the first level or keep all levels.
param_grid = {
'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
'classifier__tol': [1e-4, 1e-3, 1e-2],
'classifier__max_iter': [1000, 2000, 5000],
# Tune whether to drop the first level for categorical features or not.
'preprocessor__cat__onehot__drop': [None, 'first'],
# Experiment with class weights (None or 'balanced') to help if classes are imbalanced.
'classifier__class_weight': [None, 'balanced']
}
# Define a cross-validation strategy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# Initialize GridSearchCV with your pipeline (lasso_pipeline)
grid_search = GridSearchCV(
estimator=lasso_pipeline,
param_grid=param_grid,
scoring='roc_auc',
cv=cv,
n_jobs=-1,
verbose=1
)
# Fit the grid search on the training data.
grid_search.fit(X_train_with_indicators, y_train)
# Display the best parameters and the best ROC AUC achieved during cross-validation.
print("Best Parameters:", grid_search.best_params_)
print("Best ROC AUC:", grid_search.best_score_)
# Use the best estimator to evaluate performance on the test data.
best_lasso_model = grid_search.best_estimator_
train_evaluate_model(
model=best_lasso_model,
X_train=X_train_with_indicators,
y_train=y_train,
X_test=X_test_with_indicators,
y_test=y_test,
model_name="Tuned LASSO Logistic Regression"
)
Fitting 5 folds for each of 252 candidates, totalling 1260 fits
Best Parameters: {'classifier__C': 100, 'classifier__class_weight': 'balanced', 'classifier__max_iter': 2000, 'classifier__tol': 0.01, 'preprocessor__cat__onehot__drop': 'first'}
Best ROC AUC: 0.745313422854059
2025-02-14 13:46:51,624 - INFO - === Tuned LASSO Logistic Regression Evaluation ===
2025-02-14 13:46:51,632 - INFO - Confusion Matrix:
[[2030 899]
[ 941 2676]]
2025-02-14 13:46:51,646 - INFO -
Classification Report:
precision recall f1-score support
0.0 0.68 0.69 0.69 2929
1.0 0.75 0.74 0.74 3617
accuracy 0.72 6546
macro avg 0.72 0.72 0.72 6546
weighted avg 0.72 0.72 0.72 6546
2025-02-14 13:46:51,651 - INFO - ROC AUC: 0.7425
Out[ ]:
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE',...
'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
'V2152', 'V2153', 'V2156'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
Index([], dtype='object'))])),
('classifier',
LogisticRegression(C=100, class_weight='balanced',
max_iter=2000, penalty='l1', solver='saga',
tol=0.01))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler',
StandardScaler())]),
Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE',...
'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
'V2152', 'V2153', 'V2156'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
Index([], dtype='object'))])),
('classifier',
LogisticRegression(C=100, class_weight='balanced',
max_iter=2000, penalty='l1', solver='saga',
tol=0.01))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median')),
('scaler', StandardScaler())]),
Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE', 'V2185',
'V2193', 'V2163', 'V49', 'V...
'V2195', 'V2155', 'V2196', 'V2189', 'V2179', 'V13', 'V2143', 'V2134',
'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
'V2152', 'V2153', 'V2156'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(fill_value='missing',
strategy='constant')),
('onehot',
OneHotEncoder(drop='first',
handle_unknown='ignore'))]),
Index([], dtype='object'))])Index(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128', 'V2201',
'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175', 'V2177', 'V2116',
'V2125', 'V2182', 'sex', 'race', 'V2460', 'RESPONDENT_AGE', 'V2185',
'V2193', 'V2163', 'V49', 'V2108', 'V2101', 'V2180', 'V2164', 'V2191',
'V2195', 'V2155', 'V2196', 'V2189', 'V2179', 'V13', 'V2143', 'V2134',
'V2172', 'V2137', 'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181',
'V2152', 'V2153', 'V2156'],
dtype='object')SimpleImputer(strategy='median')
StandardScaler()
Index([], dtype='object')
SimpleImputer(fill_value='missing', strategy='constant')
OneHotEncoder(drop='first', handle_unknown='ignore')
LogisticRegression(C=100, class_weight='balanced', max_iter=2000, penalty='l1',
solver='saga', tol=0.01)In [ ]:
import numpy as np
import pandas as pd
# Assume 'best_lasso_model' is your already fitted pipeline from GridSearchCV.
# Extract the logistic regression model from the pipeline.
lr = best_lasso_model.named_steps['classifier']
# For binary classification, lr.coef_ has shape (1, n_features)
coefficients = lr.coef_[0]
# Get the preprocessor (the ColumnTransformer) from the pipeline.
preprocessor = best_lasso_model.named_steps['preprocessor']
# -------------------------------
# 1. Numeric Features and Importances
# -------------------------------
# The numeric transformer was applied first.
numeric_features = preprocessor.transformers_[0][2] # list (or Index) of numeric feature names
n_numeric = len(numeric_features)
numeric_coefs = coefficients[:n_numeric]
numeric_importances = pd.Series(np.abs(numeric_coefs), index=numeric_features)
# -------------------------------
# 2. Categorical Features (Aggregation)
# -------------------------------
# Get the original categorical columns from the transformer.
cat_features = preprocessor.transformers_[1][2]
# Check if there are any categorical features
if len(cat_features) > 0:
# Retrieve the OneHotEncoder from the categorical pipeline.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
# The remaining coefficients correspond to the one-hot encoded features.
categorical_coefs = coefficients[n_numeric:]
aggregated_cat_importance = {}
start_idx = 0
# Loop over each original categorical feature and its categories.
for feature, categories in zip(cat_features, onehot_encoder.categories_):
n_categories = len(categories)
# Get the coefficients for the dummy columns of this feature.
feature_coefs = categorical_coefs[start_idx:start_idx + n_categories]
# Aggregate by summing the absolute values.
aggregated_cat_importance[feature] = np.sum(np.abs(feature_coefs))
start_idx += n_categories
aggregated_cat_importance = pd.Series(aggregated_cat_importance)
else:
# If there are no categorical features, create an empty Series.
aggregated_cat_importance = pd.Series(dtype=float)
# -------------------------------
# 3. Combine and Select Top 20
# -------------------------------
combined_importances = pd.concat([numeric_importances, aggregated_cat_importance])
top20_features = combined_importances.sort_values(ascending=False).head(20)
print("Top 20 Aggregated Feature Importances (by absolute coefficient value):")
print(top20_features)
# Plot the top 20 features
plt.figure(figsize=(10, 6))
sns.barplot(x=top20_features.values, y=top20_features.index, palette="viridis")
plt.title('Top 20 Aggregated Feature Importances (by absolute coefficient value)')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.show()
Top 20 Aggregated Feature Importances (by absolute coefficient value): wave 0.917893 V2116 0.204811 V2137 0.107522 sex 0.096266 V2105 0.092966 V2166 0.086576 V2134 0.079518 V2128 0.074483 V13 0.067082 V2101 0.058904 V2143 0.052490 V2176 0.051462 V2187 0.046222 V2179 0.045720 V2193 0.045389 V2153 0.045158 V2188 0.039777 V2157 0.038560 V2182 0.038260 V2194 0.037967 dtype: float64
/tmp/ipykernel_1545623/809380159.py:64: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=top20_features.values, y=top20_features.index, palette="viridis")
In [ ]:
#########################################
# 2. Permutation Importance (Aggregated by Original Feature)
#########################################
from sklearn.inspection import permutation_importance
# Compute permutation importance using the original features (X_test_with_indicators).
perm_results = permutation_importance(
best_lasso_model,
X_test_with_indicators,
y_test,
scoring='roc_auc',
n_repeats=10,
random_state=RANDOM_STATE
)
perm_imp_df = pd.DataFrame({
'Feature': X_test_with_indicators.columns,
'Importance': perm_results.importances_mean
}).sort_values(by='Importance', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=perm_imp_df.head(20), palette='magma')
plt.title("Permutation Importance (Aggregated Original Features)")
plt.xlabel("Mean Importance")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()
# Display the top 20 features by permutation importance
print(perm_imp_df.head(20))
/tmp/ipykernel_1489303/241353914.py:23: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Importance', y='Feature', data=perm_imp_df.head(20), palette='magma')
Feature Importance 11 wave 0.218731 15 V2116 0.016337 42 V2105 0.004965 40 V2137 0.003549 38 V2134 0.002059 10 V2166 0.001284 18 sex 0.001066 6 V2128 0.000694 35 V2179 0.000675 1 V2188 0.000622 45 V2187 0.000611 23 V2193 0.000493 26 V2108 0.000476 8 V2173 0.000444 43 V2157 0.000364 13 V2175 0.000297 36 V13 0.000294 7 V2201 0.000282 30 V2191 0.000265 31 V2195 0.000249
In [ ]:
import shap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Get feature names (only using available features)
feature_names = numeric_features
# Calculate SHAP values for numeric features
explainer = shap.LinearExplainer(
best_lasso_model.named_steps['classifier'],
best_lasso_model.named_steps['preprocessor'].transform(X_train_with_indicators)
)
shap_values = explainer.shap_values(
best_lasso_model.named_steps['preprocessor'].transform(X_train_with_indicators)
)
# Calculate feature importance (using absolute mean SHAP values)
feature_importance = {}
for idx, feature in enumerate(feature_names):
if idx < shap_values.shape[1]: # Only process features within bounds
feature_importance[feature] = np.abs(shap_values[:, idx]).mean()
# Convert to DataFrame and sort
importance_df = pd.DataFrame({
'Feature': list(feature_importance.keys()),
'Importance': list(feature_importance.values())
}).sort_values('Importance', ascending=False)
# Display top 20 features
print("\nTop 20 Important Features:")
print(importance_df.head(20))
# Create visualization
plt.figure(figsize=(12, 8))
sns.barplot(data=importance_df.head(20), x='Importance', y='Feature')
plt.title('Top 20 Feature Importance Based on SHAP Values')
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Top 20 Important Features: Feature Importance 11 wave 0.831211 15 V2116 0.154883 18 sex 0.096673 42 V2105 0.079273 10 V2166 0.075389 36 V13 0.059174 27 V2101 0.049204 45 V2187 0.042473 43 V2157 0.037265 1 V2188 0.036909 35 V2179 0.035972 12 V2176 0.033787 23 V2193 0.033327 17 V2182 0.032703 9 V2194 0.029956 30 V2191 0.027242 48 V2153 0.026061 8 V2173 0.024039 13 V2175 0.023854 40 V2137 0.021965
In [ ]:
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
top_10_features = importance_df.head(10)['Feature'].tolist()
print("Top 10 Features:", top_10_features)
for feature in top_10_features:
print(f"{feature}: {X_train_with_indicators[feature].dtype}")
print("Unique Values in Top 10 Features:")
for feature in top_10_features:
unique_values = X_train_with_indicators[feature].unique()
print(f"{feature}: {unique_values}")
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
axes = axes.flatten()
for i, feature in enumerate(top_10_features):
if len(X_train_with_indicators[feature].unique()) > 1:
try:
PartialDependenceDisplay.from_estimator(
best_lasso_model,
X_train_with_indicators,
features=[feature],
ax=axes[i]
)
axes[i].set_title(f'PDP for {feature}')
except ValueError as e:
print(f"Error plotting PDP for {feature}: {e}")
else:
print(f"Skipping PDP for {feature}: Only one unique value in the dataset.")
axes[i].set_visible(False)
plt.tight_layout()
plt.show()
Top 10 Features: ['wave', 'V2116', 'sex', 'V2105', 'V2166', 'V13', 'V2101', 'V2187', 'V2157', 'V2188'] wave: int64 V2116: float64 sex: float64 V2105: float64 V2166: float64 V13: int64 V2101: float64 V2187: float64 V2157: float64 V2188: float64 Unique Values in Top 10 Features: wave: [2019 2018 2017 2022 2023 2021 2020] V2116: [nan 2. 1. 3. 7. 4. 6. 5.] sex: [0. 1.] V2105: [ 1. 6. 2. 3. 4. 5. 7. nan] V2166: [nan 2. 4. 6. 1. 5. 3. 8. 7.] V13: [2 4 3 1] V2101: [ 1. 2. 3. 4. nan 5.] V2187: [ 0. 1. nan] V2157: [ 1. 0. nan] V2188: [ 1. 0. nan] Error plotting PDP for sex: cannot reshape array of size 1 into shape (2)
In [ ]:
#############################
# Degree 2 Interaction
###############################
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Set a random state for reproducibility.
RANDOM_STATE = 42
# ----------------------------
# 1. Build the Pipeline
# ----------------------------
# This pipeline consists of:
# - preprocessor: your existing preprocessor for data cleaning/encoding.
# - poly: PolynomialFeatures with degree 2 (pairwise interactions only, no bias).
# - classifier: LogisticRegression with L1 penalty (sparse model) using the liblinear solver.
pipeline = Pipeline([
('preprocessor', preprocessor),
('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
('classifier', LogisticRegression(
penalty='l1',
solver='liblinear',
random_state=RANDOM_STATE,
max_iter=1000
))
])
# ----------------------------
# 2. Set Up Hyperparameter Tuning
# ----------------------------
# Here we define a parameter grid for tuning.
# In this example, we tune the inverse regularization strength 'C' for logistic regression.
param_grid = {
'classifier__C': [0.001, 0.01, 0.1, 1],
'classifier__penalty': ['l1', 'l2'],
'classifier__solver': ['liblinear'], # 'saga' could also be tested if using larger datasets
'classifier__max_iter': [500, 1000],
'classifier__tol': [1e-4, 1e-3, 1e-2]
}
# ----------------------------
# 3. Create and Fit GridSearchCV
# ----------------------------
# We use 5-fold cross-validation and accuracy as the scoring metric.
grid_search = GridSearchCV(
pipeline,
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1, # Use all available CPU cores.
verbose=1
)
# Fit the grid search on the training data.
grid_search.fit(X_train_with_indicators, y_train)
# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))
# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.4f}".format(test_accuracy))
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.01}
Best cross-validation accuracy: 0.8037
Test set accuracy: 0.8093
In [ ]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
accuracy_score,
roc_auc_score,
f1_score,
classification_report,
roc_curve # For ROC curve
)
import matplotlib.pyplot as plt # For plotting
# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))
# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
y_pred_proba = best_model.predict_proba(X_test_with_indicators)[:, 1] # Get probabilities for the positive class
# Calculate additional metrics
test_accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)
print("Test set accuracy: {:.4f}".format(test_accuracy))
print("Test set ROC AUC: {:.4f}".format(roc_auc))
print("Test set F1 score: {:.4f}".format(f1))
# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# ----------------------------
# 5. Plot the ROC Curve
# ----------------------------
# Compute FPR, TPR, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})", color='blue')
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess") # Diagonal line for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.01}
Best cross-validation accuracy: 0.8037
Test set accuracy: 0.8093
Test set ROC AUC: 0.8758
Test set F1 score: 0.8286
Classification Report:
precision recall f1-score support
0.0 0.79 0.78 0.79 2929
1.0 0.82 0.83 0.83 3617
accuracy 0.81 6546
macro avg 0.81 0.81 0.81 6546
weighted avg 0.81 0.81 0.81 6546
In [ ]:
# -----------------------------
# Step 1: Retrieve One-Hot Encoded Feature Names
# -----------------------------
# Access the pipeline for categorical features
num_pipeline = best_model.named_steps['preprocessor'].named_transformers_['num']
# Then access the OneHotEncoder within that pipeline
ohe = num_pipeline.named_steps['scaler']
# Now get the encoded feature names
encoded_feature_names = ohe.get_feature_names_out(numeric_features)
# -----------------------------
# Step 2: Retrieve Interaction Feature Names
# -----------------------------
# Get the feature names after applying PolynomialFeatures (which created interaction terms)
interaction_transformer = best_model.named_steps['poly'] # Corrected step name here
interaction_feature_names = interaction_transformer.get_feature_names_out(encoded_feature_names)
# -----------------------------
# Step 3: Extract Classifier Coefficients
# -----------------------------
# For binary classification, the classifier’s coef_ is an array of shape (1, n_features)
coefficients = best_model.named_steps['classifier'].coef_[0]
# Build a DataFrame mapping each expanded feature (both main effects and interactions) to its coefficient
features_df = pd.DataFrame({
'interaction_feature': interaction_feature_names,
'coefficient': coefficients,
'abs_coef': np.abs(coefficients)
})
# -----------------------------
# Step 4: Filter for Interaction Features Only
# -----------------------------
# With interaction_only=True, main effects do not contain a space, while interaction terms do.
interaction_df = features_df[features_df['interaction_feature'].str.contains(' ')].copy()
# -----------------------------
# Step 5: Aggregate to Original Feature Combinations
# -----------------------------
# Define a function to extract the original feature names from an interaction term.
def extract_original_features(interaction_term):
# For example, "V13_A V2152_B" or "V13_A V2152_B V49_Low"
parts = interaction_term.split(' ')
# Extract the original feature name from each part (everything before the underscore)
original_features = [part.split('_')[0] for part in parts]
# Sort the features so order doesn't matter (e.g., ('V13', 'V2152') is the same as ('V2152', 'V13'))
return tuple(sorted(original_features))
# Create a new column for the aggregated original feature combination
interaction_df['feature_combination'] = interaction_df['interaction_feature'].apply(extract_original_features)
# Group by the original feature combination and sum the absolute coefficient values as a measure of importance
agg_interactions = (
interaction_df.groupby('feature_combination')['abs_coef']
.sum()
.reset_index()
.rename(columns={'abs_coef': 'aggregated_importance'})
)
# Sort the aggregated interactions by importance in descending order
agg_interactions = agg_interactions.sort_values('aggregated_importance', ascending=False)
# -----------------------------
# Step 6: Display the Top 20 Aggregated Interaction Features
# -----------------------------
top20_agg_interactions = agg_interactions.head(20)
print("Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:")
print(top20_agg_interactions)
# Optionally, plot the results.
plt.figure(figsize=(10, 6))
sns.barplot(
x='aggregated_importance',
y=top20_agg_interactions['feature_combination'].astype(str),
data=top20_agg_interactions,
palette='viridis'
)
plt.title("Top 20 Aggregated Interaction Features")
plt.xlabel("Aggregated Importance (Sum of |Coefficients|)")
plt.ylabel("Feature Combination")
plt.tight_layout()
plt.show()
Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:
feature_combination aggregated_importance
189 (V2105, wave) 0.664864
278 (V2116, wave) 0.587994
143 (V2101, wave) 0.406701
1203 (V2196, wave) 0.244442
1188 (V2194, wave) 0.116426
789 (V2166, wave) 0.114012
1214 (V2201, wave) 0.110926
1169 (V2191, wave) 0.092729
1088 (V2184, wave) 0.070952
1196 (V2195, wave) 0.064390
873 (V2173, wave) 0.062596
96 (V13, wave) 0.058666
1224 (sex, wave) 0.042238
924 (V2176, wave) 0.038159
1146 (V2188, wave) 0.035029
818 (V2171, wave) 0.031464
899 (V2175, wave) 0.029978
1119 (V2186, wave) 0.027526
558 (V2152, wave) 0.027012
1181 (V2194, V2196) 0.026777
/tmp/ipykernel_1545623/3475757056.py:73: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
In [ ]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Set a random state for reproducibility.
RANDOM_STATE = 42
# ----------------------------
# 1. Build the Pipeline
# ----------------------------
# This pipeline consists of:
# - preprocessor: your existing preprocessor for data cleaning/encoding.
# - poly: PolynomialFeatures with degree 2 (pairwise interactions only, no bias).
# - classifier: LogisticRegression with L1 penalty (sparse model) using the liblinear solver.
pipeline = Pipeline([
('preprocessor', preprocessor),
('poly', PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)),
('classifier', LogisticRegression(
penalty='l1',
solver='liblinear',
random_state=RANDOM_STATE,
max_iter=500
))
])
# ----------------------------
# 2. Set Up Hyperparameter Tuning
# ----------------------------
# Here we define a parameter grid for tuning.
# In this example, we tune the inverse regularization strength 'C' for logistic regression.
param_grid = {
'classifier__C': [0.01, 0.1, 1],
'classifier__penalty': ['l1'],
'classifier__solver': ['liblinear'],
'classifier__max_iter': [500, 1000],
'classifier__tol': [1e-4]
}
# ----------------------------
# 3. Create and Fit GridSearchCV
# ----------------------------
# We use 5-fold cross-validation and accuracy as the scoring metric.
grid_search = GridSearchCV(
pipeline,
param_grid,
cv=3,
scoring='accuracy',
n_jobs=8, # need to adjust accordingly to precent memory overflow.
verbose=1
)
# Fit the grid search on the training data.
grid_search.fit(X_train_with_indicators, y_train)
# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))
# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy: {:.4f}".format(test_accuracy))
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.0001}
Best cross-validation accuracy: 0.8030
Test set accuracy: 0.8008
In [ ]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
accuracy_score,
roc_auc_score,
f1_score,
classification_report,
roc_curve # For ROC curve
)
import matplotlib.pyplot as plt # For plotting
# ----------------------------
# 4. Evaluate the Best Model
# ----------------------------
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))
# Use the best model to predict on the test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_with_indicators)
y_pred_proba = best_model.predict_proba(X_test_with_indicators)[:, 1] # Get probabilities for the positive class
# Calculate additional metrics
test_accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1 = f1_score(y_test, y_pred)
print("Test set accuracy: {:.4f}".format(test_accuracy))
print("Test set ROC AUC: {:.4f}".format(roc_auc))
print("Test set F1 score: {:.4f}".format(f1))
# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# ----------------------------
# 5. Plot the ROC Curve
# ----------------------------
# Compute FPR, TPR, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})", color='blue')
plt.plot([0, 1], [0, 1], 'k--', label="Random Guess") # Diagonal line for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
Best hyperparameters: {'classifier__C': 0.01, 'classifier__max_iter': 500, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 0.0001}
Best cross-validation accuracy: 0.8030
Test set accuracy: 0.8008
Test set ROC AUC: 0.8729
Test set F1 score: 0.8210
Classification Report:
precision recall f1-score support
0.0 0.78 0.77 0.78 2929
1.0 0.82 0.83 0.82 3617
accuracy 0.80 6546
macro avg 0.80 0.80 0.80 6546
weighted avg 0.80 0.80 0.80 6546
In [ ]:
# -----------------------------
# Step 1: Retrieve One-Hot Encoded Feature Names
# -----------------------------
# Access the pipeline for categorical features
num_pipeline = best_model.named_steps['preprocessor'].named_transformers_['num']
# Then access the OneHotEncoder within that pipeline
ohe = num_pipeline.named_steps['scaler']
# Now get the encoded feature names
encoded_feature_names = ohe.get_feature_names_out(numeric_features)
# -----------------------------
# Step 2: Retrieve Interaction Feature Names
# -----------------------------
# Get the feature names after applying PolynomialFeatures (which created interaction terms)
interaction_transformer = best_model.named_steps['poly'] # Corrected step name here
interaction_feature_names = interaction_transformer.get_feature_names_out(encoded_feature_names)
# -----------------------------
# Step 3: Extract Classifier Coefficients
# -----------------------------
# For binary classification, the classifier’s coef_ is an array of shape (1, n_features)
coefficients = best_model.named_steps['classifier'].coef_[0]
# Build a DataFrame mapping each expanded feature (both main effects and interactions) to its coefficient
features_df = pd.DataFrame({
'interaction_feature': interaction_feature_names,
'coefficient': coefficients,
'abs_coef': np.abs(coefficients)
})
# -----------------------------
# Step 4: Filter for Interaction Features Only
# -----------------------------
# With interaction_only=True, main effects do not contain a space, while interaction terms do.
interaction_df = features_df[features_df['interaction_feature'].str.contains(' ')].copy()
# -----------------------------
# Step 5: Aggregate to Original Feature Combinations
# -----------------------------
# Define a function to extract the original feature names from an interaction term.
def extract_original_features(interaction_term):
# For example, "V13_A V2152_B" or "V13_A V2152_B V49_Low"
parts = interaction_term.split(' ')
# Extract the original feature name from each part (everything before the underscore)
original_features = [part.split('_')[0] for part in parts]
# Sort the features so order doesn't matter (e.g., ('V13', 'V2152') is the same as ('V2152', 'V13'))
return tuple(sorted(original_features))
# Create a new column for the aggregated original feature combination
interaction_df['feature_combination'] = interaction_df['interaction_feature'].apply(extract_original_features)
# Group by the original feature combination and sum the absolute coefficient values as a measure of importance
agg_interactions = (
interaction_df.groupby('feature_combination')['abs_coef']
.sum()
.reset_index()
.rename(columns={'abs_coef': 'aggregated_importance'})
)
# Sort the aggregated interactions by importance in descending order
agg_interactions = agg_interactions.sort_values('aggregated_importance', ascending=False)
# -----------------------------
# Step 6: Display the Top 20 Aggregated Interaction Features
# -----------------------------
top20_agg_interactions = agg_interactions.head(20)
print("Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:")
print(top20_agg_interactions)
# Optionally, plot the results.
plt.figure(figsize=(10, 6))
sns.barplot(
x='aggregated_importance',
y=top20_agg_interactions['feature_combination'].astype(str),
data=top20_agg_interactions,
palette='viridis'
)
plt.title("Top 20 Aggregated Interaction Features")
plt.xlabel("Aggregated Importance (Sum of |Coefficients|)")
plt.ylabel("Feature Combination")
plt.tight_layout()
plt.show()
Top 20 Aggregated Interaction Features by Summed Absolute Coefficient:
feature_combination aggregated_importance
6634 (V2116, wave) 0.732379
4609 (V2105, wave) 0.708883
3528 (V2101, wave) 0.525570
20768 (V2196, wave) 0.228015
3619 (V2105, V2116, wave) 0.192851
5644 (V2108, wave) 0.167442
20704 (V2194, wave) 0.127520
20804 (V2201, wave) 0.115384
2447 (V2101, V2105, wave) 0.102783
16329 (V2166, wave) 0.098145
20604 (V2191, wave) 0.092698
20740 (V2195, wave) 0.084765
3574 (V2105, V2108, wave) 0.083104
2397 (V13, race, wave) 0.080824
18224 (V2176, wave) 0.079418
17548 (V2173, wave) 0.063086
2538 (V2101, V2116, wave) 0.050391
14872 (V2163, V2164, wave) 0.047727
20008 (V2184, wave) 0.047025
20549 (V2191, V2193, wave) 0.042181
/tmp/ipykernel_1545623/3089346036.py:72: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(
Random Forest Classifier¶
In [ ]:
# Define Random State for reproducibility
RANDOM_STATE = 42
# Use RepeatedStratifiedKFold for more robust validation
N_SPLITS_CV = 5
N_REPEATS = 1 # repeat the CV multiple times if desired
SCORING_METRIC = 'roc_auc'
VERBOSE = 1
logging.info("\n--- Random Forest (Revised) ---")
# Build pipeline
rf_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])
# Parameter for RandomizedSearch
rf_param_dist = {
'classifier__n_estimators': [100, 200, 500, 1000],
'classifier__max_depth': [5, 10, 20, 50],
'classifier__min_samples_split': [2, 5, 10, 20],
'classifier__min_samples_leaf': [1, 2, 5, 10],
'classifier__max_features': ['sqrt', 'log2', 0.3, 0.5, 0.7], # Mix of float and string
'classifier__bootstrap': [True, False],
'classifier__class_weight': [None, 'balanced']
}
try:
logging.info("Starting randomized search for Random Forest...")
# Use RepeatedStratifiedKFold without shuffle
cv_rf = RepeatedStratifiedKFold(
n_splits=N_SPLITS_CV,
n_repeats=N_REPEATS,
random_state=RANDOM_STATE
)
# RandomizedSearchCV to cover more combinations within reasonable compute time
rf_random_search = RandomizedSearchCV(
estimator=rf_pipeline,
param_distributions=rf_param_dist,
n_iter=30, # Increase or decrease based on resources
cv=cv_rf,
scoring=SCORING_METRIC,
n_jobs=-1, # Use all available cores
random_state=RANDOM_STATE,
verbose=VERBOSE
)
# Fit the RandomizedSearchCV
rf_random_search.fit(X_train_with_indicators, y_train)
logging.info(f"Best parameters (RF): {rf_random_search.best_params_}")
logging.info(f"Best cross-validation {SCORING_METRIC}: {rf_random_search.best_score_:.4f}")
# Extract the best estimator
best_rf = rf_random_search.best_estimator_
except Exception as e:
logging.error(f"An error occurred during Random Forest randomized search: {e}")
raise
# Evaluate the best Random Forest
try:
best_rf.fit(X_train_with_indicators, y_train)
y_pred_rf = best_rf.predict(X_test_with_indicators)
y_pred_proba_rf = best_rf.predict_proba(X_test_with_indicators)[:, 1]
logging.info("=== Best Random Forest Evaluation ===")
logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred_rf)))
logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred_rf)))
logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_rf):.4f}")
# Plot ROC
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf, tpr_rf, label=f'AUC = {roc_auc_score(y_test, y_pred_proba_rf):.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve on Test Data')
plt.legend(loc='lower right')
plt.show()
except Exception as e:
logging.error(f"An error occurred during Random Forest training/evaluation: {e}")
raise
logging.info("Script completed successfully.")
2025-02-14 11:32:08,285 - INFO - --- Random Forest (Revised) --- 2025-02-14 11:32:08,287 - INFO - Starting randomized search for Random Forest...
Fitting 5 folds for each of 30 candidates, totalling 150 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
warnings.warn(
2025-02-14 11:34:57,748 - INFO - Best parameters (RF): {'classifier__n_estimators': 200, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.3, 'classifier__max_depth': 20, 'classifier__class_weight': None, 'classifier__bootstrap': False}
2025-02-14 11:34:57,750 - INFO - Best cross-validation roc_auc: 0.9151
2025-02-14 11:35:09,935 - INFO - === Best Random Forest Evaluation ===
2025-02-14 11:35:09,942 - INFO - Confusion Matrix:
[[2373 556]
[ 543 3074]]
2025-02-14 11:35:09,955 - INFO -
Classification Report:
precision recall f1-score support
0.0 0.81 0.81 0.81 2929
1.0 0.85 0.85 0.85 3617
accuracy 0.83 6546
macro avg 0.83 0.83 0.83 6546
weighted avg 0.83 0.83 0.83 6546
2025-02-14 11:35:09,960 - INFO - ROC AUC: 0.9139
2025-02-14 11:35:10,129 - INFO - Script completed successfully.
In [ ]:
# Define the model file path
model_filename = os.path.expanduser('~/work/vaping_project_data/best_rf_model.joblib')
# Save the trained model
joblib.dump(best_rf, model_filename)
logging.info(f"Model saved to {model_filename}")
2025-02-14 11:35:21,792 - INFO - Model saved to /storage/home/szn5432/work/vaping_project_data/best_rf_model.joblib
In [ ]:
# Load the model from the specified path
logging.info("Loading the model...")
best_rf = joblib.load(os.path.expanduser('~/work/vaping_project_data/best_rf_model.joblib'))
logging.info("Model loaded successfull")
2025-02-14 11:35:23,382 - INFO - Loading the model... 2025-02-14 11:35:23,571 - INFO - Model loaded successfull
In [ ]:
try:
logging.info("Starting feature importance analysis...")
# Access the RandomForestClassifier from the pipeline
rf_model = best_rf.named_steps['classifier']
# Get feature importances
feature_importance = rf_model.feature_importances_
# Access the preprocessor step
preprocessor = best_rf.named_steps['preprocessor']
# Get transformed feature names
if hasattr(preprocessor, 'get_feature_names_out'):
feature_names = preprocessor.get_feature_names_out()
else:
# Fallback: Generate feature names if get_feature_names_out is not available
X_train_transformed = preprocessor.transform(X_train_with_indicators)
feature_names = [f"Feature_{idx}" for idx in range(X_train_transformed.shape[1])]
# Debugging: Print shapes and lengths
logging.info(f"Shape of X_train_with_indicators: {X_train_with_indicators.shape}")
logging.info(f"Length of feature_importance: {len(feature_importance)}")
logging.info(f"Number of feature names: {len(feature_names)}")
logging.info(f"Feature names: {feature_names}")
# Check if lengths match
if len(feature_names) != len(feature_importance):
raise ValueError(
f"Mismatch in lengths: feature_names ({len(feature_names)}) != feature_importance ({len(feature_importance)})"
)
# Create a DataFrame for visualization
feature_importance_df = pd.DataFrame({
'Feature': feature_names,
'Importance': feature_importance
})
# Aggregate importances for original features
original_feature_importance = {}
for feature, importance in zip(feature_names, feature_importance):
# Extract the original feature name (e.g., 'cat__V13_2' -> 'V13')
original_feature = feature.split('__')[1].split('_')[0]
# Sum importances for each original feature
if original_feature in original_feature_importance:
original_feature_importance[original_feature] += importance
else:
original_feature_importance[original_feature] = importance
# Create a DataFrame for aggregated importances
aggregated_importance_df = pd.DataFrame({
'Feature': list(original_feature_importance.keys()),
'Importance': list(original_feature_importance.values())
})
# Sort features by importance
aggregated_importance_df = aggregated_importance_df.sort_values(by='Importance', ascending=False)
# Plot aggregated feature importance
plt.figure(figsize=(20, 12))
sns.barplot(x='Importance', y='Feature', data=aggregated_importance_df, palette='viridis')
plt.title('Aggregated Feature Importance (Original Features)')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
# Display top 20 feature importances
top_20_features = aggregated_importance_df.head(20)
print("Top 20 Feature Importances:")
print(top_20_features)
except Exception as e:
logging.error(f"An error occurred during feature importance analysis: {e}")
raise
2025-02-14 11:35:31,865 - INFO - Starting feature importance analysis... 2025-02-14 11:35:31,890 - INFO - Shape of X_train_with_indicators: (26184, 100) 2025-02-14 11:35:31,890 - INFO - Length of feature_importance: 50 2025-02-14 11:35:31,891 - INFO - Number of feature names: 50 2025-02-14 11:35:31,891 - INFO - Feature names: ['num__V2178' 'num__V2188' 'num__V2197' 'num__V2184' 'num__V2186' 'num__V2171' 'num__V2128' 'num__V2201' 'num__V2173' 'num__V2194' 'num__V2166' 'num__wave' 'num__V2176' 'num__V2175' 'num__V2177' 'num__V2116' 'num__V2125' 'num__V2182' 'num__sex' 'num__race' 'num__V2460' 'num__RESPONDENT_AGE' 'num__V2185' 'num__V2193' 'num__V2163' 'num__V49' 'num__V2108' 'num__V2101' 'num__V2180' 'num__V2164' 'num__V2191' 'num__V2195' 'num__V2155' 'num__V2196' 'num__V2189' 'num__V2179' 'num__V13' 'num__V2143' 'num__V2134' 'num__V2172' 'num__V2137' 'num__V2140' 'num__V2105' 'num__V2157' 'num__V2183' 'num__V2187' 'num__V2181' 'num__V2152' 'num__V2153' 'num__V2156'] /tmp/ipykernel_1489303/3638803247.py:63: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Importance', y='Feature', data=aggregated_importance_df, palette='viridis')
Top 20 Feature Importances: Feature Importance 11 wave 0.504127 15 V2116 0.112886 42 V2105 0.086625 27 V2101 0.060063 26 V2108 0.022534 33 V2196 0.019504 10 V2166 0.014651 19 race 0.014635 30 V2191 0.011908 9 V2194 0.010518 47 V2152 0.009940 31 V2195 0.008595 35 V2179 0.008535 24 V2163 0.007627 29 V2164 0.007486 36 V13 0.007007 23 V2193 0.006488 8 V2173 0.006179 3 V2184 0.006108 12 V2176 0.005942
In [ ]:
# best_rf is the best estimator from your RandomizedSearchCV
tree_model = best_rf.named_steps['classifier']
# Transform the entire training set
X_train_processed_full = best_rf.named_steps['preprocessor'].transform(X_train_with_indicators)
# Convert to DataFrame for easier sampling & feature naming
feature_names = best_rf.named_steps['preprocessor'].get_feature_names_out()
X_train_processed_df = pd.DataFrame(X_train_processed_full, columns=feature_names)
# Randomly sample 5000 rows from the processed data
X_background = X_train_processed_df.sample(n=1000, random_state=42)
# Create the explainer on just the 5000 background points
explainer = shap.TreeExplainer(tree_model, data=X_background)
# If you also want to compute shap values for the same subset (typical):
shap_values = explainer.shap_values(X_background)
shap_values_class1 = shap_values[1]
99%|===================| 1989/2000 [03:08<00:01]
In [ ]:
def get_original_feature_name(encoded_name):
"""
Example parser that assumes you have names like 'cat__Gender_Male'
or 'cat__MaritalStatus_Single'.
We split on '__' to separate the transformer name from the remainder.
Then we split the remainder on '_' and assume the first chunk is the original column
(e.g., 'Gender' or 'MaritalStatus').
"""
parts = encoded_name.split("__", maxsplit=1)
if len(parts) == 2:
# e.g. 'cat', 'Gender_Male'
transformer_name, remainder = parts
# Now parse the remainder by '_' to get the original column
remainder_parts = remainder.split("_", maxsplit=1)
original_col = remainder_parts[0] # e.g. 'Gender'
return original_col
else:
# If there's no double underscore or unexpected format,
# just return the full encoded_name
return encoded_name
In [ ]:
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
top_10_features = top_20_original_features.index[:8].tolist()
# Create a 2-row x 5-column grid of subplots
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(20, 8))
# Flatten the axes array to pass it easily if needed
axs = axs.ravel()
pdp = PartialDependenceDisplay.from_estimator(
best_rf,
X_train_with_indicators,
features=top_10_features,
feature_names=X_train_with_indicators.columns,
random_state=42,
ax=axs # pass axes array here
)
fig.suptitle("Partial Dependence Plots for Top 10 Features", fontsize=16, y=1.02)
plt.tight_layout()
Gradient Boosting Trees¶
In [ ]:
import logging
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
# Random State for reproducibility
RANDOM_STATE = 42
# Use RepeatedStratifiedKFold for more robust validation
N_SPLITS_CV = 5
N_REPEATS = 1 # Repeat the CV multiple times if desired
SCORING_METRIC = 'roc_auc'
VERBOSE = 1
logging.info("\n--- Gradient Boosting (Revised) ---")
# Build pipeline
gbc_pipeline = Pipeline([
('preprocessor', preprocessor),
('classifier', GradientBoostingClassifier(random_state=RANDOM_STATE))
])
# Expanded parameter distributions for RandomizedSearch
gbc_param_dist = {
'classifier__n_estimators': [100, 300, 500, 1000],
'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
'classifier__max_depth': [3, 5, 10, 20],
'classifier__subsample': [0.8, 0.9, 1.0], # Controls sample ratio per tree
'classifier__min_samples_split': [2, 5, 10],
'classifier__min_samples_leaf': [1, 2, 5],
'classifier__max_features': ['sqrt', 'log2', None]
}
try:
logging.info("Starting randomized search for Gradient Boosting...")
# Use RepeatedStratifiedKFold without shuffle
cv_gbc = RepeatedStratifiedKFold(
n_splits=N_SPLITS_CV,
n_repeats=N_REPEATS,
random_state=RANDOM_STATE
)
# RandomizedSearchCV to cover more combinations within reasonable compute time
gbc_random_search = RandomizedSearchCV(
estimator=gbc_pipeline,
param_distributions=gbc_param_dist,
n_iter=50, # Increase or decrease based on resources
cv=cv_gbc,
scoring=SCORING_METRIC,
n_jobs=24, # Use all available cores
random_state=RANDOM_STATE,
verbose=VERBOSE
)
# Fit the RandomizedSearchCV
gbc_random_search.fit(X_train_with_indicators, y_train)
logging.info(f"Best parameters (GBC): {gbc_random_search.best_params_}")
logging.info(f"Best cross-validation {SCORING_METRIC}: {gbc_random_search.best_score_:.4f}")
# Extract the best estimator
best_gbc = gbc_random_search.best_estimator_
except Exception as e:
logging.error(f"An error occurred during Gradient Boosting randomized search: {e}")
raise
# Evaluate the best Gradient Boosting model
try:
best_gbc.fit(X_train_with_indicators, y_train)
y_pred_gbc = best_gbc.predict(X_test_with_indicators)
y_pred_proba_gbc = best_gbc.predict_proba(X_test_with_indicators)[:, 1]
logging.info("=== Best Gradient Boosting Evaluation ===")
logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred_gbc)))
logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred_gbc)))
logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_gbc):.4f}")
# Plot ROC Curve
fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_pred_proba_gbc)
plt.figure(figsize=(8, 6))
plt.plot(fpr_gbc, tpr_gbc, label=f'AUC = {roc_auc_score(y_test, y_pred_proba_gbc):.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Gradient Boosting ROC Curve on Test Data')
plt.legend(loc='lower right')
plt.show()
except Exception as e:
logging.error(f"An error occurred during Gradient Boosting training/evaluation: {e}")
raise
logging.info("Script completed successfully.")
2025-02-14 14:35:57,123 - INFO - --- Gradient Boosting (Revised) --- 2025-02-14 14:35:57,125 - INFO - Starting randomized search for Gradient Boosting...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
warnings.warn(
2025-02-14 14:45:46,214 - INFO - Best parameters (GBC): {'classifier__subsample': 0.9, 'classifier__n_estimators': 1000, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 5, 'classifier__max_features': None, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.01}
2025-02-14 14:45:46,216 - INFO - Best cross-validation roc_auc: 0.9165
2025-02-14 14:46:53,258 - INFO - === Best Gradient Boosting Evaluation ===
2025-02-14 14:46:53,265 - INFO - Confusion Matrix:
[[2370 559]
[ 529 3088]]
2025-02-14 14:46:53,278 - INFO -
Classification Report:
precision recall f1-score support
0.0 0.82 0.81 0.81 2929
1.0 0.85 0.85 0.85 3617
accuracy 0.83 6546
macro avg 0.83 0.83 0.83 6546
weighted avg 0.83 0.83 0.83 6546
2025-02-14 14:46:53,282 - INFO - ROC AUC: 0.9159
2025-02-14 14:46:53,446 - INFO - Script completed successfully.
In [ ]:
# Define the model file path
model_filename = os.path.expanduser('~/work/vaping_project_data/best_gbt_model.joblib')
# Save the trained model
joblib.dump(best_gbc, model_filename)
logging.info(f"Model saved to {model_filename}")
2025-02-14 14:49:47,488 - INFO - Model saved to /storage/home/szn5432/work/vaping_project_data/best_gbt_model.joblib
In [ ]:
# Load the model (when needed)
file_path = os.path.expanduser('~/work/vaping_project_data/best_gbt_model.joblib')
loaded_gbt = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
# Access the pipeline for categorical features
num_pipeline = loaded_gbt.named_steps['preprocessor'].named_transformers_['num']
# Then access the OneHotEncoder within that pipeline
ohe = num_pipeline.named_steps['scaler']
# Now get the encoded feature names
encoded_feature_names = ohe.get_feature_names_out(numeric_features)
encoded_feature_names
Out[ ]:
array(['V2178', 'V2188', 'V2197', 'V2184', 'V2186', 'V2171', 'V2128',
'V2201', 'V2173', 'V2194', 'V2166', 'wave', 'V2176', 'V2175',
'V2177', 'V2116', 'V2125', 'V2182', 'sex', 'race', 'V2460',
'RESPONDENT_AGE', 'V2185', 'V2193', 'V2163', 'V49', 'V2108',
'V2101', 'V2180', 'V2164', 'V2191', 'V2195', 'V2155', 'V2196',
'V2189', 'V2179', 'V13', 'V2143', 'V2134', 'V2172', 'V2137',
'V2140', 'V2105', 'V2157', 'V2183', 'V2187', 'V2181', 'V2152',
'V2153', 'V2156'], dtype=object)
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 1. Get the numerical pipeline
num_pipeline = loaded_gbt.named_steps['preprocessor'].named_transformers_['num']
# 2. Get the scaler (StandardScaler or whatever you named it)
scaler = num_pipeline.named_steps['scaler']
# 3. Get the feature names out (they will match one-to-one with your numeric_features)
encoded_feature_names = scaler.get_feature_names_out(numeric_features)
# 4. Get the trained classifier and its feature importances
gbt_classifier = loaded_gbt.named_steps['classifier']
importances = gbt_classifier.feature_importances_
# 5. Build a DataFrame of features vs. importances
feature_importance_df = pd.DataFrame({
'Feature': encoded_feature_names,
'Importance': importances
})
# 6. Sort by ascending order of importance
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=True)
# 7. Take top 20 features and plot
top_20 = feature_importance_df.tail(20)
plt.figure(figsize=(12, 8))
plt.barh(y=top_20['Feature'], width=top_20['Importance'])
plt.title('Top 20 Most Important Original Features')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
# 8. Print the DataFrame in descending order for readability
print("\nFeature Importance Rankings:")
print(feature_importance_df.sort_values('Importance', ascending=False))
Feature Importance Rankings:
Feature Importance
15 V2116 0.333590
11 wave 0.318799
42 V2105 0.111097
27 V2101 0.084814
19 race 0.024483
33 V2196 0.017528
26 V2108 0.011029
10 V2166 0.010013
9 V2194 0.009626
47 V2152 0.005866
30 V2191 0.005776
36 V13 0.004605
31 V2195 0.004590
29 V2164 0.004188
35 V2179 0.004186
7 V2201 0.003781
12 V2176 0.003388
2 V2197 0.003107
8 V2173 0.003022
23 V2193 0.002970
24 V2163 0.002806
18 sex 0.002401
13 V2175 0.002381
3 V2184 0.002251
0 V2178 0.002250
6 V2128 0.002124
14 V2177 0.002117
17 V2182 0.001385
46 V2181 0.001374
39 V2172 0.001293
28 V2180 0.001276
25 V49 0.001213
37 V2143 0.001157
44 V2183 0.001075
48 V2153 0.000920
21 RESPONDENT_AGE 0.000846
43 V2157 0.000752
22 V2185 0.000683
1 V2188 0.000635
40 V2137 0.000580
45 V2187 0.000557
32 V2155 0.000530
49 V2156 0.000512
4 V2186 0.000510
5 V2171 0.000476
38 V2134 0.000447
34 V2189 0.000418
16 V2125 0.000411
20 V2460 0.000104
41 V2140 0.000057
In [ ]:
# 1) Sort by descending importance and take top 10.
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)
top_10_features = feature_importance_df['Feature'].head(13).tolist()
# 2) Exclude features that start with "missing_"
# or that don’t actually exist in X_train_with_indicators.
filtered_features = [
f for f in top_10_features
if not f.startswith('missing_') and f in X_train_with_indicators.columns
]
print("Top 10 original features:", top_10_features)
print("Filtered features (excluding 'missing_'): ", filtered_features)
# 3) Plot PDPs for the filtered features
n_features = len(filtered_features)
n_rows = (n_features + 1) // 2 # so we can arrange them in a grid
n_cols = 2
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 4*n_rows), squeeze=False)
axes = axes.flatten()
for i, feat in enumerate(filtered_features):
ax = axes[i]
PartialDependenceDisplay.from_estimator(
estimator=loaded_gbt,
X=X_train_with_indicators,
features=[feat],
feature_names=X_train_with_indicators.columns,
kind='average', # or 'both' if you want ICE lines
ax=ax
)
ax.set_title(f"PDP for {feat}")
# If there are unused subplots, hide them
for j in range(i+1, len(axes)):
axes[j].set_visible(False)
plt.tight_layout()
plt.show()
Top 10 original features: ['V2116', 'wave', 'V2105', 'V2101', 'race', 'V2196', 'V2108', 'V2166', 'V2194', 'V2152', 'V2191', 'V13', 'V2195'] Filtered features (excluding 'missing_'): ['V2116', 'wave', 'V2105', 'V2101', 'race', 'V2196', 'V2108', 'V2166', 'V2194', 'V2152', 'V2191', 'V13', 'V2195']
In [ ]:
X_train_transformed = preprocessor.transform(X_train_with_indicators)
print("Shape of X_train_transformed:", X_train_transformed.shape)
print("Dtypes (if it is a NumPy array):", X_train_transformed.dtype)
# If X_train_transformed is a Pandas DataFrame:
if hasattr(X_train_transformed, 'dtypes'):
print(X_train_transformed.dtypes)
X_train_with_indicators.info() # or X_train_with_indicators.isna().sum() if DataFrame
Shape of X_train_transformed: (26184, 50) Dtypes (if it is a NumPy array): float64 <class 'pandas.core.frame.DataFrame'> RangeIndex: 26184 entries, 0 to 26183 Data columns (total 100 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 V2178 25473 non-null float64 1 V2188 25162 non-null float64 2 V2197 24821 non-null float64 3 V2184 24562 non-null float64 4 V2186 25162 non-null float64 5 V2171 25690 non-null float64 6 V2128 25948 non-null float64 7 V2201 24792 non-null float64 8 V2173 25486 non-null float64 9 V2194 25142 non-null float64 10 V2166 25505 non-null float64 11 wave 26184 non-null int64 12 V2176 24467 non-null float64 13 V2175 24912 non-null float64 14 V2177 24843 non-null float64 15 V2116 25637 non-null float64 16 V2125 25846 non-null float64 17 V2182 24349 non-null float64 18 sex 26184 non-null float64 19 race 26184 non-null float64 20 V2460 25156 non-null float64 21 RESPONDENT_AGE 26137 non-null float64 22 V2185 25162 non-null float64 23 V2193 23619 non-null float64 24 V2163 25864 non-null float64 25 V49 26007 non-null float64 26 V2108 25138 non-null float64 27 V2101 25965 non-null float64 28 V2180 24492 non-null float64 29 V2164 25852 non-null float64 30 V2191 25206 non-null float64 31 V2195 25038 non-null float64 32 V2155 25956 non-null float64 33 V2196 25035 non-null float64 34 V2189 25162 non-null float64 35 V2179 25454 non-null float64 36 V13 26184 non-null int64 37 V2143 25996 non-null float64 38 V2134 25961 non-null float64 39 V2172 25550 non-null float64 40 V2137 25919 non-null float64 41 V2140 26040 non-null float64 42 V2105 25365 non-null float64 43 V2157 25956 non-null float64 44 V2183 24976 non-null float64 45 V2187 25162 non-null float64 46 V2181 24370 non-null float64 47 V2152 26108 non-null float64 48 V2153 25963 non-null float64 49 V2156 25956 non-null float64 50 missing_V2178 26184 non-null bool 51 missing_V2188 26184 non-null bool 52 missing_V2197 26184 non-null bool 53 missing_V2184 26184 non-null bool 54 missing_V2186 26184 non-null bool 55 missing_V2171 26184 non-null bool 56 missing_V2128 26184 non-null bool 57 missing_V2201 26184 non-null bool 58 missing_V2173 26184 non-null bool 59 missing_V2194 26184 non-null bool 60 missing_V2166 26184 non-null bool 61 missing_wave 26184 non-null bool 62 missing_V2176 26184 non-null bool 63 missing_V2175 26184 non-null bool 64 missing_V2177 26184 non-null bool 65 missing_V2116 26184 non-null bool 66 missing_V2125 26184 non-null bool 67 missing_V2182 26184 non-null bool 68 missing_sex 26184 non-null bool 69 missing_race 26184 non-null bool 70 missing_V2460 26184 non-null bool 71 missing_RESPONDENT_AGE 26184 non-null bool 72 missing_V2185 26184 non-null bool 73 missing_V2193 26184 non-null bool 74 missing_V2163 26184 non-null bool 75 missing_V49 26184 non-null bool 76 missing_V2108 26184 non-null bool 77 missing_V2101 26184 non-null bool 78 missing_V2180 26184 non-null bool 79 missing_V2164 26184 non-null bool 80 missing_V2191 26184 non-null bool 81 missing_V2195 26184 non-null bool 82 missing_V2155 26184 non-null bool 83 missing_V2196 26184 non-null bool 84 missing_V2189 26184 non-null bool 85 missing_V2179 26184 non-null bool 86 missing_V13 26184 non-null bool 87 missing_V2143 26184 non-null bool 88 missing_V2134 26184 non-null bool 89 missing_V2172 26184 non-null bool 90 missing_V2137 26184 non-null bool 91 missing_V2140 26184 non-null bool 92 missing_V2105 26184 non-null bool 93 missing_V2157 26184 non-null bool 94 missing_V2183 26184 non-null bool 95 missing_V2187 26184 non-null bool 96 missing_V2181 26184 non-null bool 97 missing_V2152 26184 non-null bool 98 missing_V2153 26184 non-null bool 99 missing_V2156 26184 non-null bool dtypes: bool(50), float64(48), int64(2) memory usage: 11.2 MB
In [ ]:
##### SHAP Feature Importance ####
# Extract the GradientBoostingClassifier
gbt_models = loaded_gbt.named_steps['classifier']
# Get preprocessed features
X_preprocessed = loaded_gbt.named_steps['preprocessor'].transform(X_test_with_indicators)
# Create SHAP explainer
explainer = shap.TreeExplainer(gbt_models)
# Calculate SHAP values
shap_values = explainer.shap_values(X_preprocessed)
# Get feature names after preprocessing
feature_names = loaded_gbt.named_steps['preprocessor'].get_feature_names_out()
In [ ]:
# Create visualizations
# Summary plot
shap.summary_plot(shap_values, X_preprocessed, feature_names=feature_names)
# Bar plot of feature importance
shap.summary_plot(shap_values, X_preprocessed, feature_names=feature_names, plot_type='bar')
In [ ]:
# 1. Aggregate SHAP values by base feature
feature_importances = {}
for i, col in enumerate(feature_names):
base_feature = col.replace('num__', '') # Extract base feature name
if base_feature not in feature_importances:
feature_importances[base_feature] = []
feature_importances[base_feature].extend(np.abs(shap_values[:, i]))
# 2. Calculate mean absolute SHAP value for each base feature
aggregated_importances = {
feature: np.mean(values) for feature, values in feature_importances.items()
}
# 3. Sort features by importance
sorted_importances = sorted(
aggregated_importances.items(), key=lambda item: item[1], reverse=True
)
# 4. Create a DataFrame for plotting
importance_df = pd.DataFrame(sorted_importances, columns=['Feature', 'Importance'])
# Filter to show only the top 20 features
top_20_importance_df = importance_df.head(20)
In [ ]:
# 5. Create the bar plot
plt.figure(figsize=(12, 8)) # Adjust size as needed
plt.barh(top_20_importance_df['Feature'], top_20_importance_df['Importance'], color='dodgerblue')
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Feature')
plt.title('Top 20 Features Ranked by Mean Absolute SHAP Value')
plt.gca().invert_yaxis() # Most important feature on top
plt.tight_layout()
plt.show()
In [ ]:
# For individual predictions (e.g., first sample)
shap.initjs() # Initialize JavaScript visualization
single_sample_idx = 0
shap.force_plot(explainer.expected_value[1] if isinstance(shap_values, list) else explainer.expected_value,
shap_values[single_sample_idx] if isinstance(shap_values, list) else shap_values[single_sample_idx,:],
X_preprocessed[single_sample_idx],
feature_names=feature_names)
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
###### Demonstration of SHAP feature importance for one individual case ###########
# For individual predictions (e.g., first sample)
single_sample_idx = 0
single_sample_shap_values = shap_values[single_sample_idx] if isinstance(shap_values, list) else shap_values[single_sample_idx, :]
# Aggregate SHAP values by base feature for the single sample
feature_importances = {}
for i, feature_name in enumerate(feature_names):
base_feature = col.replace('num__', '')
# Extract base feature name
shap_value = single_sample_shap_values[i]
feature_importances[base_feature] = feature_importances.get(base_feature, 0) + abs(shap_value)
# Sort features by importance
sorted_importances = sorted(feature_importances.items(), key=lambda item: item[1], reverse=True)
# Print the ranked feature importance for the single prediction
print(f"Overall Feature Importance for Sample {single_sample_idx}:")
for feature, importance in sorted_importances:
print(f"{feature}: {importance:.4f}")
Overall Feature Importance for Sample 0: V2156: 3.5576
In [ ]:
# SHAP feature interaction
# Calculate SHAP interaction values
shap_interaction_values = explainer.shap_interaction_values(X_preprocessed)
In [ ]:
import numpy as np
import pandas as pd
def aggregate_shap_interactions(shap_interaction_values, feature_names, get_base_feature):
"""
Aggregates pairwise SHAP interaction values back to their original (pre–one-hot) features.
Parameters
----------
shap_interaction_values : np.ndarray
SHAP interaction values of shape [n_samples, n_features, n_features].
feature_names : list of str
The one-hot-encoded feature names corresponding to shap_interaction_values.
get_base_feature : callable
A function that takes a one-hot-encoded feature name and returns the base/original feature name.
Returns
-------
pd.DataFrame
DataFrame with ["Feature1", "Feature2", "InteractionValue", "AbsInteraction"]
sorted in descending order of AbsInteraction.
"""
# 1. Aggregate across samples (e.g., mean absolute interactions)
interaction_matrix = np.mean(np.abs(shap_interaction_values), axis=0)
# 2. Map each OHE feature to a base feature
base_feature_names = [get_base_feature(n) for n in feature_names]
unique_base_features = list(set(base_feature_names))
# 3. Build a structure to accumulate aggregated pairwise interactions
aggregated_interactions = {
bf_i: {bf_j: 0.0 for bf_j in unique_base_features}
for bf_i in unique_base_features
}
n_features = len(feature_names)
for i in range(n_features):
for j in range(i+1, n_features): # i+1 => no diagonal, no duplicates
bf_i = base_feature_names[i]
bf_j = base_feature_names[j]
aggregated_interactions[bf_i][bf_j] += interaction_matrix[i, j]
# 4. Convert to DataFrame
data_records = []
for bf_i in unique_base_features:
for bf_j in unique_base_features:
# If you want to keep only i <= j, add a condition to avoid duplicates
interaction_val = aggregated_interactions[bf_i][bf_j]
data_records.append((bf_i, bf_j, interaction_val))
df_interactions = pd.DataFrame(data_records, columns=["Feature1", "Feature2", "InteractionValue"])
df_interactions["AbsInteraction"] = df_interactions["InteractionValue"].abs()
# Sort descending by absolute interaction
df_interactions.sort_values("AbsInteraction", ascending=False, inplace=True)
df_interactions.reset_index(drop=True, inplace=True)
df_interactions_no_diagonal = df_interactions[df_interactions['Feature1'] != df_interactions['Feature2']]
return df_interactions_no_diagonal
# Example usage:
def simple_get_base_feature(name):
# If it has the "num__" prefix, strip it off
if name.startswith("num__"):
name = name[len("num__"):] # "V2178"
# If it has the "cat__" prefix, strip that as well
if name.startswith("cat__"):
name = name[len("cat__"):]
# Now 'name' might look like "V2178"
# Just return it as the base feature
return name
df_agg_interactions = aggregate_shap_interactions(
shap_interaction_values=shap_interaction_values,
feature_names=feature_names,
get_base_feature=simple_get_base_feature
)
# Print the top 20 interactions
print(df_agg_interactions.head(20))
# --- Pick Top 20 Interactions ---
df_top_20 = df_agg_interactions.head(20).copy()
# Create a convenient label for each pair
df_top_20["Pair"] = df_top_20["Feature1"] + " & " + df_top_20["Feature2"]
# --- Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(
data=df_top_20,
y="Pair",
x="AbsInteraction",
color="royalblue"
)
plt.title("Top 20 Pairwise Feature Interactions by Absolute SHAP Value")
plt.xlabel("Absolute SHAP Interaction Value")
plt.ylabel("Feature Pair")
plt.tight_layout()
plt.show()
Feature1 Feature2 InteractionValue AbsInteraction 0 wave V2116 0.381856 0.381856 1 wave V2105 0.305814 0.305814 2 wave V2101 0.181126 0.181126 3 wave V2196 0.057863 0.057863 4 wave race 0.055804 0.055804 5 V2116 V2105 0.049386 0.049386 6 V2194 wave 0.040912 0.040912 7 wave V2108 0.035271 0.035271 8 V2166 wave 0.033239 0.033239 9 V2116 V2101 0.033163 0.033163 10 wave V2191 0.028146 0.028146 11 wave sex 0.022875 0.022875 12 wave V2195 0.022266 0.022266 13 wave V2176 0.020286 0.020286 14 V2116 race 0.017869 0.017869 15 V2101 V2105 0.017857 0.017857 16 V2201 wave 0.017011 0.017011 17 V2196 V2105 0.016886 0.016886 18 wave V2152 0.015859 0.015859 19 wave V13 0.015846 0.015846
In [ ]:
# Interaction of the top 2 features
top_features_indices = np.argsort(np.abs(shap_values).mean(0))[-2:] # Get indices of top 2 features
feature1_idx = top_features_indices[0]
feature2_idx = top_features_indices[1]
feature1_name = feature_names[feature1_idx]
feature2_name = feature_names[feature2_idx]
In [ ]:
# Visualize the interaction between the top two features
shap.dependence_plot(
feature1_idx,
shap_values,
X_preprocessed,
feature_names=feature_names,
interaction_index=feature2_idx,
)
shap.dependence_plot(
feature2_idx,
shap_values,
X_preprocessed,
feature_names=feature_names,
interaction_index=feature1_idx,
)
In [ ]:
# 1. Group feature indices by base feature (removing "num__" prefix).
base_feature_indices = {}
for i, feature_name in enumerate(feature_names):
# Remove "num__" so each feature remains distinct
base_feature = feature_name.replace("num__", "")
if base_feature not in base_feature_indices:
base_feature_indices[base_feature] = []
base_feature_indices[base_feature].append(i)
base_feature_list = list(base_feature_indices.keys())
# 2. Calculate overall interaction importance for each pair of base features
# without duplicating reversed pairs (bf1,bf2) vs (bf2,bf1).
base_feature_interaction_importance = {}
for bf1_idx in range(len(base_feature_list)):
for bf2_idx in range(bf1_idx + 1, len(base_feature_list)):
bf1 = base_feature_list[bf1_idx]
bf2 = base_feature_list[bf2_idx]
# Sum up the pairwise interactions between *all* sub-indices of bf1 and bf2.
interaction_sum = 0.0
for i in base_feature_indices[bf1]:
for j in base_feature_indices[bf2]:
if isinstance(shap_interaction_values, list):
# e.g., for multiclass or ensemble, focusing on shap_interaction_values[0]
interaction_sum += shap_interaction_values[0][0, i, j]
# If you want both i->j and j->i, add shap_interaction_values[0][0, j, i]
# but usually shap_interaction_values[i,j] == shap_interaction_values[j,i].
else:
interaction_sum += shap_interaction_values[0, i, j]
# Same note here if you want both directions.
# Use absolute value as "importance"
pair_key = (bf1, bf2) # We already enforce bf1_idx < bf2_idx
base_feature_interaction_importance[pair_key] = abs(interaction_sum)
# 3. Sort base feature interactions by importance
sorted_base_feature_interactions = sorted(
base_feature_interaction_importance.items(),
key=lambda item: item[1],
reverse=True
)
# 4. Print the top 50 ranked base feature interactions (no (bf, bf), no reversed duplicates)
print("Top 50 Overall Base Feature Interaction Importance (Skipping self-interactions & duplicates):")
for (bf1, bf2), importance in sorted_base_feature_interactions[:50]:
print(f"Interaction between {bf1} and {bf2}: {importance:.4f}")
Top 50 Overall Base Feature Interaction Importance (Skipping self-interactions & duplicates): Interaction between wave and V2105: 0.4028 Interaction between wave and V2116: 0.3704 Interaction between wave and V2101: 0.1144 Interaction between V2201 and wave: 0.1128 Interaction between V2166 and wave: 0.0705 Interaction between wave and V2196: 0.0540 Interaction between wave and race: 0.0299 Interaction between V2184 and wave: 0.0246 Interaction between V2116 and race: 0.0215 Interaction between wave and V2195: 0.0202 Interaction between wave and sex: 0.0202 Interaction between wave and V2176: 0.0186 Interaction between V2194 and wave: 0.0180 Interaction between wave and V2108: 0.0171 Interaction between wave and V2191: 0.0163 Interaction between V2166 and V2116: 0.0128 Interaction between race and V2105: 0.0127 Interaction between V2184 and V2175: 0.0121 Interaction between V2175 and sex: 0.0117 Interaction between wave and V2152: 0.0094 Interaction between V2116 and V2105: 0.0090 Interaction between V2196 and V2105: 0.0089 Interaction between V2201 and sex: 0.0083 Interaction between V2201 and V2105: 0.0081 Interaction between V2166 and V2157: 0.0078 Interaction between V2201 and V2196: 0.0073 Interaction between sex and V2105: 0.0072 Interaction between V2184 and V2157: 0.0072 Interaction between V2175 and V2191: 0.0070 Interaction between V2201 and V2166: 0.0068 Interaction between V2194 and V2105: 0.0068 Interaction between V2116 and V2101: 0.0067 Interaction between wave and V2177: 0.0067 Interaction between V2166 and V2105: 0.0067 Interaction between V2194 and V2166: 0.0064 Interaction between V2201 and V2101: 0.0061 Interaction between wave and V13: 0.0061 Interaction between V2116 and V2152: 0.0058 Interaction between V2197 and wave: 0.0058 Interaction between V2175 and V2105: 0.0056 Interaction between V2116 and sex: 0.0056 Interaction between wave and V2193: 0.0056 Interaction between V2194 and V2175: 0.0054 Interaction between V2176 and sex: 0.0050 Interaction between V2195 and V2105: 0.0048 Interaction between V2166 and sex: 0.0048 Interaction between sex and V2152: 0.0047 Interaction between V2173 and wave: 0.0047 Interaction between V2176 and V2105: 0.0045 Interaction between V2166 and V2152: 0.0043
In [ ]:
# 3. Create a matrix for the heatmap
num_base_features = len(base_feature_names)
interaction_matrix = np.zeros((num_base_features, num_base_features))
for i, bf1 in enumerate(base_feature_names):
for j, bf2 in enumerate(base_feature_names):
# Use the sorted tuple for lookup
key = tuple(sorted((bf1, bf2)))
if key in base_feature_interaction_importance:
interaction_matrix[i, j] = base_feature_interaction_importance[key]
#else:
# print(f"Warning: Interaction not found for {bf1}, {bf2}") # Optional debugging
# 4. Visualize the aggregated interaction using a heatmap
plt.figure(figsize=(30, 28))
plt.imshow(interaction_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar(label='Aggregated Interaction Strength')
plt.xticks(range(num_base_features), base_feature_names, rotation=45, ha="right")
plt.yticks(range(num_base_features), base_feature_names)
plt.title('Aggregated SHAP Interaction Between Base Features')
plt.tight_layout()
plt.show()
In [ ]:
# Interaction Summary Plot (for overall interaction strengths)
shap.summary_plot(shap_interaction_values, X_preprocessed, feature_names=feature_names)
Histogram-based Gradient Boost Classifier¶
In [ ]:
import logging
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import FunctionTransformer
# Random State for reproducibility
RANDOM_STATE = 42
# Use RepeatedStratifiedKFold for more robust validation
N_SPLITS_CV = 5
N_REPEATS = 2 # Repeat the CV multiple times if desired
SCORING_METRIC = 'roc_auc'
VERBOSE = 1
logging.info("\n--- Gradient Boosting (Revised) ---")
# Define the transformer:
from sklearn.preprocessing import FunctionTransformer
def to_dense_func(X):
"""Convert sparse matrices to dense arrays (if needed)."""
return X.toarray() if hasattr(X, 'toarray') else X
to_dense = FunctionTransformer(to_dense_func)
# Build pipeline
gbc_pipeline = Pipeline([
('preprocessor', preprocessor),
('to_dense', to_dense),
('classifier', HistGradientBoostingClassifier(random_state=RANDOM_STATE))
])
# Expanded parameter distributions for RandomizedSearch
# Optimized parameter grid for 24-core/100GB RAM
param_grid = {
'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2], # Wider range including very low rates
'classifier__max_depth': [6, 12, 18], # Deeper trees with more variation
'classifier__min_samples_leaf': [20, 50, 100], # More granular leaf sizes
'classifier__l2_regularization': [0.0, 0.1, 0.5, 1.0],# Stronger regularization options
'classifier__max_bins': [255], # Keep max bins for accuracy
'classifier__max_leaf_nodes': [64, 128, 256], # Control tree complexity
'classifier__max_iter': [2000], # Let early stopping handle actual iterations
}
try:
logging.info("Starting randomized search for Gradient Boosting...")
# Use RepeatedStratifiedKFold without shuffle
cv_gbc = RepeatedStratifiedKFold(
n_splits=N_SPLITS_CV,
n_repeats=N_REPEATS,
random_state=RANDOM_STATE
)
# RandomizedSearchCV to cover more combinations within reasonable compute time
gbc_random_search = RandomizedSearchCV(
estimator=gbc_pipeline,
param_distributions=param_grid,
n_iter=50, # Increase or decrease based on resources
cv=cv_gbc,
scoring=SCORING_METRIC,
n_jobs=24, # Use all available cores
random_state=RANDOM_STATE,
verbose=VERBOSE
)
# Fit the RandomizedSearchCV
gbc_random_search.fit(X_train_with_indicators, y_train)
logging.info(f"Best parameters (GBC): {gbc_random_search.best_params_}")
logging.info(f"Best cross-validation {SCORING_METRIC}: {gbc_random_search.best_score_:.4f}")
# Extract the best estimator
best_gbc = gbc_random_search.best_estimator_
except Exception as e:
logging.error(f"An error occurred during Gradient Boosting randomized search: {e}")
raise
# Evaluate the best Gradient Boosting model
try:
best_gbc.fit(X_train_with_indicators, y_train)
y_pred_gbc = best_gbc.predict(X_test_with_indicators)
y_pred_proba_gbc = best_gbc.predict_proba(X_test_with_indicators)[:, 1]
logging.info("=== Best Gradient Boosting Evaluation ===")
logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred_gbc)))
logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred_gbc)))
logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba_gbc):.4f}")
# Plot ROC Curve
fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_pred_proba_gbc)
plt.figure(figsize=(8, 6))
plt.plot(fpr_gbc, tpr_gbc, label=f'AUC = {roc_auc_score(y_test, y_pred_proba_gbc):.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Gradient Boosting ROC Curve on Test Data')
plt.legend(loc='lower right')
plt.show()
except Exception as e:
logging.error(f"An error occurred during Gradient Boosting training/evaluation: {e}")
raise
logging.info("Script completed successfully.")
2025-02-14 17:06:17,375 - INFO - --- Gradient Boosting (Revised) --- 2025-02-14 17:06:17,376 - INFO - Starting randomized search for Gradient Boosting...
Fitting 10 folds for each of 50 candidates, totalling 500 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
2025-02-14 17:15:26,165 - INFO - Best parameters (GBC): {'classifier__min_samples_leaf': 50, 'classifier__max_leaf_nodes': 64, 'classifier__max_iter': 2000, 'classifier__max_depth': 18, 'classifier__max_bins': 255, 'classifier__learning_rate': 0.05, 'classifier__l2_regularization': 0.1}
2025-02-14 17:15:26,166 - INFO - Best cross-validation roc_auc: 0.9098
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
2025-02-14 17:15:31,141 - INFO - === Best Gradient Boosting Evaluation ===
2025-02-14 17:15:31,148 - INFO - Confusion Matrix:
[[2369 560]
[ 571 3046]]
2025-02-14 17:15:31,161 - INFO -
Classification Report:
precision recall f1-score support
0.0 0.81 0.81 0.81 2929
1.0 0.84 0.84 0.84 3617
accuracy 0.83 6546
macro avg 0.83 0.83 0.83 6546
weighted avg 0.83 0.83 0.83 6546
2025-02-14 17:15:31,165 - INFO - ROC AUC: 0.9091
2025-02-14 17:15:31,421 - INFO - Script completed successfully.
In [ ]:
# Define the model file path
model_filename = os.path.expanduser('~/work/vaping_project_data/best_hgbt_model.joblib')
# Save the trained model
joblib.dump(best_gbc, model_filename)
logging.info(f"Model saved to {model_filename}")
2025-02-14 17:16:56,234 - INFO - Model saved to /storage/home/szn5432/work/vaping_project_data/best_hgbt_model.joblib
In [ ]:
# Load the model (when needed)
file_path = os.path.expanduser('~/work/vaping_project_data/best_hgbt_model.joblib')
loaded_hgbt = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
from sklearn.inspection import permutation_importance
# Calculate permutation importance on RAW DATA (let pipeline handle preprocessing)
result = permutation_importance(
loaded_hgbt, # This is your full pipeline
X_test_with_indicators, # Raw data with missing indicators
y_test,
n_repeats=5,
random_state=RANDOM_STATE,
n_jobs=CPU_COUNT
)
# Get feature names from the raw data (including missing indicators)
feature_names = X_test_with_indicators.columns.tolist()
# Create importance DataFrame
perm_importance = pd.DataFrame({
'Feature': feature_names,
'Importance': result.importances_mean
}).sort_values('Importance', ascending=False)
# Plot top 20
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=perm_importance.head(20))
plt.title("Top 20 Features by Permutation Importance (Raw Features)")
plt.show()
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn( /storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn(
In [ ]:
# Create a table for top 20 feature importances
top_20_features = perm_importance.head(20)
# Display the table
print("Top 20 Feature Importances:")
display(top_20_features.style.background_gradient(cmap='Blues', subset=['Importance']))
Top 20 Feature Importances:
| Feature | Importance | |
|---|---|---|
| 11 | wave | 0.315643 |
| 15 | V2116 | 0.065383 |
| 27 | V2101 | 0.031103 |
| 42 | V2105 | 0.025023 |
| 19 | race | 0.015826 |
| 26 | V2108 | 0.004980 |
| 7 | V2201 | 0.001833 |
| 17 | V2182 | 0.001253 |
| 14 | V2177 | 0.001130 |
| 12 | V2176 | 0.001130 |
| 18 | sex | 0.000978 |
| 33 | V2196 | 0.000886 |
| 24 | V2163 | 0.000886 |
| 2 | V2197 | 0.000825 |
| 25 | V49 | 0.000794 |
| 28 | V2180 | 0.000764 |
| 6 | V2128 | 0.000733 |
| 9 | V2194 | 0.000733 |
| 10 | V2166 | 0.000733 |
| 49 | V2156 | 0.000703 |
In [ ]:
##### SHAP Feature Importance ####
# Extract the GradientBoostingClassifier
hgbt_models = loaded_hgbt.named_steps['classifier']
# Get preprocessed features
X_preprocessed = loaded_hgbt.named_steps['preprocessor'].transform(X_test_with_indicators)
# Convert sparse matrix to DataFrame
X_preprocessed = pd.DataFrame(X_preprocessed.toarray())
# Create SHAP explainer
explainer = shap.TreeExplainer(hgbt_models)
# Calculate SHAP values
shap_values = explainer.shap_values(X_preprocessed)
# Get feature names after preprocessing
feature_names = loaded_hgbt.named_steps['preprocessor'].get_feature_names_out()
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros warnings.warn(
In [ ]:
# 1. Aggregate SHAP values by base feature
feature_importances = {}
for i, col in enumerate(feature_names):
base_feature = '_'.join(col.split('_')[:-1]) # Extract base feature name
if base_feature not in feature_importances:
feature_importances[base_feature] = []
feature_importances[base_feature].extend(np.abs(shap_values[:, i]))
# 2. Calculate mean absolute SHAP value for each base feature
aggregated_importances = {
feature: np.mean(values) for feature, values in feature_importances.items()
}
# 3. Sort features by importance
sorted_importances = sorted(
aggregated_importances.items(), key=lambda item: item[1], reverse=True
)
# 4. Create a DataFrame for plotting
importance_df = pd.DataFrame(sorted_importances, columns=['Feature', 'Importance'])
# Filter to show only the top 20 features
top_20_importance_df = importance_df.head(20)
# 5. Create the bar plot
plt.figure(figsize=(12, 8)) # Adjust size as needed
plt.barh(top_20_importance_df['Feature'], top_20_importance_df['Importance'], color='dodgerblue')
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Feature')
plt.title('Top 20 Features Ranked by Mean Absolute SHAP Value')
plt.gca().invert_yaxis() # Most important feature on top
plt.tight_layout()
plt.show()
In [ ]:
import numpy as np
# Suppose your X_preprocessed is shape (n_samples, n_features).
# Randomly sample e.g. 300 rows:
sample_size = 300
if X_preprocessed.shape[0] > sample_size:
rnd_idx = np.random.choice(X_preprocessed.shape[0], sample_size, replace=False)
X_sampled = X_preprocessed.iloc[rnd_idx]
else:
X_sampled = X_preprocessed
# Now compute interaction values on this smaller subset
shap_interaction_values = explainer.shap_interaction_values(X_sampled)
In [ ]:
import numpy as np
import pandas as pd
def aggregate_shap_interactions(shap_interaction_values, feature_names, get_base_feature):
"""
Aggregates pairwise SHAP interaction values back to their original (pre–one-hot) features.
Parameters
----------
shap_interaction_values : np.ndarray
SHAP interaction values of shape [n_samples, n_features, n_features].
feature_names : list of str
The one-hot-encoded feature names corresponding to shap_interaction_values.
get_base_feature : callable
A function that takes a one-hot-encoded feature name and returns the base/original feature name.
Returns
-------
pd.DataFrame
DataFrame with ["Feature1", "Feature2", "InteractionValue", "AbsInteraction"]
sorted in descending order of AbsInteraction.
"""
# 1. Aggregate across samples (e.g., mean absolute interactions)
interaction_matrix = np.mean(np.abs(shap_interaction_values), axis=0)
# 2. Map each OHE feature to a base feature
base_feature_names = [get_base_feature(n) for n in feature_names]
unique_base_features = list(set(base_feature_names))
# 3. Build a structure to accumulate aggregated pairwise interactions
aggregated_interactions = {
bf_i: {bf_j: 0.0 for bf_j in unique_base_features}
for bf_i in unique_base_features
}
n_features = len(feature_names)
for i in range(n_features):
for j in range(i+1, n_features): # i+1 => no diagonal, no duplicates
bf_i = base_feature_names[i]
bf_j = base_feature_names[j]
aggregated_interactions[bf_i][bf_j] += interaction_matrix[i, j]
# 4. Convert to DataFrame
data_records = []
for bf_i in unique_base_features:
for bf_j in unique_base_features:
# If you want to keep only i <= j, add a condition to avoid duplicates
interaction_val = aggregated_interactions[bf_i][bf_j]
data_records.append((bf_i, bf_j, interaction_val))
df_interactions = pd.DataFrame(data_records, columns=["Feature1", "Feature2", "InteractionValue"])
df_interactions["AbsInteraction"] = df_interactions["InteractionValue"].abs()
# Sort descending by absolute interaction
df_interactions.sort_values("AbsInteraction", ascending=False, inplace=True)
df_interactions.reset_index(drop=True, inplace=True)
df_interactions_no_diagonal = df_interactions[df_interactions['Feature1'] != df_interactions['Feature2']]
return df_interactions_no_diagonal
# Example usage:
def simple_get_base_feature(name):
# Remove the cat__ prefix if present
if name.startswith("cat__"):
name = name[len("cat__"):]
# Then split on the first underscore only
return name.split("_", 1)[0]
df_agg_interactions = aggregate_shap_interactions(
shap_interaction_values=shap_interaction_values,
feature_names=feature_names,
get_base_feature=simple_get_base_feature
)
# Print the top 20 interactions
print(df_agg_interactions.head(20))
# --- Pick Top 20 Interactions ---
df_top_20 = df_agg_interactions.head(20).copy()
# Create a convenient label for each pair
df_top_20["Pair"] = df_top_20["Feature1"] + " & " + df_top_20["Feature2"]
# --- Plot ---
plt.figure(figsize=(10, 6))
sns.barplot(
data=df_top_20,
y="Pair",
x="AbsInteraction",
color="royalblue"
)
plt.title("Top 20 Pairwise Feature Interactions by Absolute SHAP Value")
plt.xlabel("Absolute SHAP Interaction Value")
plt.ylabel("Feature Pair")
plt.tight_layout()
plt.show()
Feature1 Feature2 InteractionValue AbsInteraction 1 wave V2116 0.512344 0.512344 2 wave V2105 0.453857 0.453857 3 wave V2101 0.280974 0.280974 4 wave race 0.173988 0.173988 5 wave V2196 0.081804 0.081804 6 wave V2108 0.077313 0.077313 7 V2166 wave 0.053517 0.053517 8 V2194 wave 0.050455 0.050455 9 wave sex 0.049956 0.049956 10 wave V13 0.041606 0.041606 11 V2116 V2105 0.038339 0.038339 12 V2116 V2101 0.035215 0.035215 13 wave V2191 0.033705 0.033705 14 wave V2152 0.033623 0.033623 15 wave V2179 0.033035 0.033035 17 V2201 wave 0.031867 0.031867 18 V2116 race 0.030408 0.030408 19 wave V2164 0.026907 0.026907 20 wave V2195 0.026834 0.026834 21 V2173 wave 0.025816 0.025816
XGBOOST¶
In [ ]:
import os
import logging
import joblib
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
import numpy as np
# ======================
# 1. Expanded Hyperparameter Grid
# ======================
param_dist = {
# Increase n_estimators up to 1000 (or more)
'classifier__n_estimators': [100, 300, 500, 800, 1000],
# Smaller learning rates for finer updates
'classifier__learning_rate': [0.01, 0.02, 0.03, 0.05, 0.1],
# Broader range for max_depth
'classifier__max_depth': [3, 5, 7, 9, 12],
# Tweak min_child_weight to control complexity
'classifier__min_child_weight': [1, 3, 5, 7, 10],
# Keep or enlarge subsample
'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
# Potentially try colsample_bytree > 1.0 (uncommon, but possible)
'classifier__colsample_bytree': [0.6, 0.8, 1.0, 1.2],
# Adjust gamma (larger => more conservative splits)
'classifier__gamma': [0, 0.1, 0.3, 0.6, 1.0],
# Expand regularization range
'classifier__reg_alpha': [0, 0.1, 0.2, 0.5, 1.0, 2.0],
'classifier__reg_lambda': [0.5, 1.0, 1.5, 2.0, 3.0, 4.0]
}
# ============================
# 2. OPTIONAL FEATURE ENGINEERING
# ============================
# Example: add a custom transformer to create domain-specific or interaction features
# (Here, we just pass data through, but you'd modify 'feature_engineering' to transform the input DataFrame.)
def feature_engineering(X):
"""
Placeholder function where you can create domain-specific,
polynomial, or ratio features. This must return a DataFrame or array.
"""
# Example: create a simple ratio of two columns (if they exist)
# if 'colA' in X and 'colB' in X:
# X['ratio_A_B'] = X['colA'] / (X['colB'] + 1e-9)
return X
feature_eng_transformer = FunctionTransformer(feature_engineering, validate=False)
# ============================
# 3. Build Pipeline
# ============================
# We assume:
# - You already have 'preprocessor' for OneHotEncoder, etc.
# - 'X_train_with_indicators', 'y_train', 'X_test_with_indicators', 'y_test'
# - 'train_evaluate_model(...)' function
# - Constants: RANDOM_STATE, SCORING_METRIC, N_SPLITS_CV
xgb_clf = xgb.XGBClassifier(
eval_metric='logloss',
random_state=RANDOM_STATE
)
# Here we insert a feature engineering step *before* the preprocessor:
xgb_pipeline = Pipeline([
('feature_engineering', feature_eng_transformer),
('preprocessor', preprocessor),
('classifier', xgb_clf)
])
# =========================
# 4. (Optional) Early Stopping
# =========================
# In scikit-learn's RandomizedSearchCV, providing early_stopping_rounds is non-trivial because
# you need a separate validation set or a custom approach inside each CV fold.
# If you wish to do a simple holdout for early stopping, you'd do something like:
#
# fit_params = {
# 'classifier__early_stopping_rounds': 30,
# 'classifier__eval_metric': 'logloss',
# 'classifier__eval_set': [(X_val, y_val)], # separate validation set
# }
#
# Then pass fit_params=fit_params to random_search.fit(...).
# This is more advanced; we’ll skip it here for brevity.
# ============================
# 5. Randomized Search Setup
# ============================
cv = StratifiedKFold(n_splits=N_SPLITS_CV, shuffle=True, random_state=RANDOM_STATE)
random_search = RandomizedSearchCV(
estimator=xgb_pipeline,
param_distributions=param_dist,
n_iter=50, # Increase if you have time/resources
cv=cv,
scoring=SCORING_METRIC,
n_jobs=-1,
verbose=1,
random_state=RANDOM_STATE
)
logging.info("Starting RandomizedSearchCV for expanded XGBoost grid...")
random_search.fit(X_train_with_indicators, y_train)
logging.info("RandomizedSearchCV complete.")
best_xgb_model = random_search.best_estimator_
logging.info(f"Best parameters: {random_search.best_params_}")
logging.info(f"Best CV {SCORING_METRIC}: {random_search.best_score_:.4f}")
# ============================
# 6. Evaluate & Save Best Model
# ============================
trained_best_xgb = train_evaluate_model(
model=best_xgb_model,
X_train=X_train_with_indicators,
y_train=y_train,
X_test=X_test_with_indicators,
y_test=y_test,
model_name="Tuned XGBoost Model"
)
model_filename = os.path.expanduser('~/work/vaping_project_data/best_xgb_model.joblib')
joblib.dump(trained_best_xgb, model_filename)
logging.info(f"Final XGBoost model saved to: {model_filename}")
2025-02-15 10:33:01,124 - INFO - Starting RandomizedSearchCV for expanded XGBoost grid...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/model_selection/_validation.py:528: FitFailedWarning:
75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
return fit_method(estimator, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/pipeline.py", line 662, in fit
self._final_estimator.fit(Xt, y, **last_step_params["fit"])
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 726, in inner_f
return func(**kwargs)
^^^^^^^^^^^^^^
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/sklearn.py", line 1599, in fit
self._Booster = train(
^^^^^^
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 726, in inner_f
return func(**kwargs)
^^^^^^^^^^^^^^
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/training.py", line 181, in train
bst.update(dtrain, iteration=i, fobj=obj)
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 2100, in update
_check_call(
File "/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/xgboost/core.py", line 284, in _check_call
raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: value 1.2 for Parameter colsample_bytree exceed bound [0,1]
colsample_bytree: Subsample ratio of columns, resample on each tree construction.
warnings.warn(some_fits_failed_message, FitFailedWarning)
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/model_selection/_search.py:1108: UserWarning: One or more of the test scores are non-finite: [ nan nan nan 0.90200238 0.90706627 nan
0.89999718 nan 0.89720182 0.85991863 0.87354831 nan
0.896498 0.89432499 0.91069245 nan 0.90742226 nan
0.86254671 0.89445416 0.82215935 0.83872734 0.89939779 0.9083536
0.89200625 0.90937551 0.90646569 nan 0.89624894 0.90608667
0.90722221 0.90915747 0.89949231 0.90255506 0.87744443 nan
0.90801846 0.88299783 0.84824171 0.89819916 nan 0.88255418
0.90983895 0.90093799 nan 0.90967376 nan nan
nan 0.90768914]
warnings.warn(
2025-02-15 10:35:58,854 - INFO - RandomizedSearchCV complete.
2025-02-15 10:35:58,856 - INFO - Best parameters: {'classifier__subsample': 0.6, 'classifier__reg_lambda': 4.0, 'classifier__reg_alpha': 0.5, 'classifier__n_estimators': 800, 'classifier__min_child_weight': 5, 'classifier__max_depth': 12, 'classifier__learning_rate': 0.01, 'classifier__gamma': 0.3, 'classifier__colsample_bytree': 1.0}
2025-02-15 10:35:58,857 - INFO - Best CV roc_auc: 0.9107
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
2025-02-15 10:38:07,097 - INFO - === Tuned XGBoost Model Evaluation ===
2025-02-15 10:38:07,103 - INFO - Confusion Matrix:
[[2384 545]
[ 598 3019]]
2025-02-15 10:38:07,116 - INFO -
Classification Report:
precision recall f1-score support
0.0 0.80 0.81 0.81 2929
1.0 0.85 0.83 0.84 3617
accuracy 0.83 6546
macro avg 0.82 0.82 0.82 6546
weighted avg 0.83 0.83 0.83 6546
2025-02-15 10:38:07,120 - INFO - ROC AUC: 0.9101
2025-02-15 10:38:07,350 - INFO - Final XGBoost model saved to: /storage/home/szn5432/work/vaping_project_data/best_xgb_model.joblib
In [ ]:
# Load the model (when needed)
def feature_engineering(X):
"""
Placeholder function where you can create domain-specific,
polynomial, or ratio features. This must return a DataFrame or array.
"""
# Example: create a simple ratio of two columns (if they exist)
# if 'colA' in X and 'colB' in X:
# X['ratio_A_B'] = X['colA'] / (X['colB'] + 1e-9)
return X
file_path = os.path.expanduser('~/work/vaping_project_data/best_xgb_model.joblib')
loaded_xgb = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
# Function to plot feature importances
def plot_feature_importance(loaded_xgb, feature_names, top_n=20, title="Feature Importance"):
"""
Plots the top N feature importances from a trained model.
"""
if hasattr(loaded_xgb, 'feature_importances_'):
importances = loaded_xgb.feature_importances_
elif hasattr(loaded_xgb, 'named_steps') and 'classifier' in loaded_xgb.named_steps:
if hasattr(loaded_xgb.named_steps['classifier'], 'feature_importances_'):
importances = loaded_xgb.named_steps['classifier'].feature_importances_
else:
raise ValueError("Classifier does not have feature_importances_ attribute.")
else:
raise ValueError("Provided model does not have feature_importances_ attribute.")
fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
fi_df = fi_df.sort_values('Importance', ascending=False).head(top_n)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=fi_df)
plt.title(title)
plt.tight_layout()
plt.show()
# Get the feature names from the preprocessor
feature_names = loaded_xgb.named_steps['preprocessor'].get_feature_names_out()
# Plot the top 20 most important features
plot_feature_importance(loaded_xgb, feature_names, top_n=20, title="Top 20 Most Important Features")
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def plot_aggregated_feature_importance(model, top_n=20, title="Top 20 Aggregated Feature Importance"):
"""
Plots the top N aggregated feature importances for a CatBoost model
that is wrapped inside a Pipeline with a ColumnTransformer.
Parameters
----------
model : Pipeline
A scikit-learn Pipeline that includes:
- 'preprocessor': a ColumnTransformer or other transformer
- 'classifier': a CatBoostClassifier
top_n : int, optional (default=20)
How many top aggregated features to display.
title : str, optional
Title of the plot.
"""
# 1. Get the feature names after the preprocessor step
feature_names = loaded_xgb.named_steps['preprocessor'].get_feature_names_out()
# 2. Get the feature importances from CatBoost
xgboost_estimator = loaded_xgb.named_steps['classifier']
if not hasattr(xgboost_estimator, 'feature_importances_'):
raise AttributeError("The CatBoost classifier does not expose 'feature_importances_'.")
importances = xgboost_estimator.feature_importances_
# 3. Aggregate importances by original feature
aggregated_importance = {}
for name, imp in zip(feature_names, importances):
# Example naming conventions after ColumnTransformer + OneHotEncoder:
# "onehotencoder__Gender_Male"
# "remainder__Age"
# Adjust this parsing logic as necessary for your pipeline.
if "__" in name:
# Split on the double underscore to separate the transformer name vs. the actual column
parts = name.split("__", maxsplit=1)
# parts[0] might be 'onehotencoder' or 'remainder'
# parts[1] might be 'Gender_Male' or 'Age'
# We'll then split again on '_' if needed to get just the original column name
col_part = parts[1]
# If the column was numeric (remainder), it may be simply 'Age'.
# If the column was OHE, it might be 'Gender_Male' or 'Gender_Female'.
# A simple approach is to take everything before the first underscore as the feature name:
if "_" in col_part:
original_feature = col_part.split("_", maxsplit=1)[0]
else:
original_feature = col_part
else:
# If there's no double underscore, assume the whole name is the feature
original_feature = name
# Sum up the importances
aggregated_importance[original_feature] = aggregated_importance.get(original_feature, 0.0) + imp
# 4. Make a DataFrame of aggregated importances and sort
agg_df = pd.DataFrame(list(aggregated_importance.items()), columns=["Feature", "Importance"])
agg_df = agg_df.sort_values("Importance", ascending=False)
# 5. Plot top N aggregated feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=agg_df.head(top_n))
plt.title(title)
plt.tight_layout()
plt.show()
# --- Usage Example ---
# Assuming you have your best_model pipeline (with 'preprocessor' and CatBoost 'classifier'):
plot_aggregated_feature_importance(loaded_xgb, top_n=20, title="Top 20 Aggregated Feature Importance")
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
def get_top_aggregated_features(loaded_xgb, top_n=10):
"""
Returns a list of top_n original (aggregated) feature names
based on the CatBoost feature importances in a Pipeline.
Parameters
----------
model : Pipeline
A scikit-learn Pipeline with steps:
- "preprocessor" : ColumnTransformer (or similar)
- "classifier" : CatBoostClassifier
top_n : int
Number of top features to return
Returns
-------
list of str
Top N aggregated feature names
"""
# Extract feature names that come out of the preprocessor
feature_names = loaded_xgb.named_steps['preprocessor'].get_feature_names_out()
# Extract importances from CatBoost
xgboost_estimator = loaded_xgb.named_steps['classifier']
importances = xgboost_estimator.feature_importances_
# Aggregate importances by the original (pre-encoding) feature name
aggregated_importance = {}
for name, imp in zip(feature_names, importances):
if "__" in name:
# Example: "onehotencoder__Gender_Male" -> original_feature = "Gender"
parts = name.split("__", maxsplit=1)
col_part = parts[1]
if "_" in col_part:
# For OHE columns like "Gender_Male"
original_feature = col_part.split("_", maxsplit=1)[0]
else:
# For remainder numeric columns
original_feature = col_part
else:
# If no __, assume name is the feature
original_feature = name
aggregated_importance[original_feature] = (
aggregated_importance.get(original_feature, 0.0) + imp
)
# Turn into a DataFrame, sort, and get top_n
agg_df = pd.DataFrame(
list(aggregated_importance.items()),
columns=["Feature", "Importance"]
).sort_values("Importance", ascending=False)
return agg_df.head(top_n)["Feature"].tolist()
# 1. Get the top 10 features by aggregated importance
top_features = get_top_aggregated_features(loaded_xgb, top_n=10)
print("Top 10 aggregated features:\n", top_features)
# Plot the partial dependence plot
for feat in top_features:
if feat not in X_train_with_indicators.columns:
print(f"Skipping feature '{feat}' as it is not found in the DataFrame.")
continue
fig, ax = plt.subplots(figsize=(6, 4))
PartialDependenceDisplay.from_estimator(
estimator=loaded_xgb,
X=X_train_with_indicators,
features=[feat],
kind='average',
grid_resolution=50,
target=1, # positive class for binary classification
ax=ax
)
plt.title(f"Partial Dependence of {feat}")
plt.show()
Top 10 aggregated features: ['wave', 'V2116', 'V2101', 'V2105', 'V2108', 'V2193', 'V2152', 'V2191', 'V2176', 'V2179']
In [ ]:
# Add this import at the top of your script
from scipy import sparse
# --- SHAP Feature Importance ---
# Suppose your ColumnTransformer has the name "cat" for the OneHotEncoder step
# and you pass in categorical_features as the input_features:
encoded_feature_names = (
preprocessor
.named_transformers_['cat'] # "cat" is the name of the OHE step in ColumnTransformer
.get_feature_names_out(input_features=categorical_features)
)
# Extract components from the pipeline
preprocessor = loaded_xgb.named_steps['preprocessor']
classifier = loaded_xgb.named_steps['classifier']
# Process the data through the pipeline
X_processed = preprocessor.transform(X_train_with_indicators)
# Convert sparse matrix to dense if needed
if isinstance(X_processed, (sparse.csr_matrix, sparse.csc_matrix)):
X_processed = X_processed.toarray()
# Create a SHAP explainer
explainer = shap.TreeExplainer(classifier)
# Calculate SHAP values (using a sample for faster computation)
sample_idx = np.random.choice(X_processed.shape[0], 100, replace=False)
shap_values = explainer.shap_values(X_processed[sample_idx])
# Get feature names from the preprocessor
feature_names = encoded_feature_names # From your existing code
In [ ]:
# Summary plot (feature importance)
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values,
X_processed[sample_idx],
feature_names=feature_names,
plot_type="bar",
show=False)
plt.title("SHAP Feature Importance (Mean Absolute Impact)")
plt.tight_layout()
plt.show()
# Detailed summary plot
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values,
X_processed[sample_idx],
feature_names=feature_names,
show=False)
plt.title("SHAP Value Distribution")
plt.tight_layout()
plt.show()
In [ ]:
# Force plot for average prediction
plt.figure()
shap.force_plot(explainer.expected_value,
shap_values[0],
X_processed[0],
feature_names=feature_names,
matplotlib=True,
show=False)
plt.title("SHAP Force Plot for First Sample")
plt.tight_layout()
plt.show()
<Figure size 640x480 with 0 Axes>
In [ ]:
# Extract original categorical features from one-hot encoded column names
import re
# Get the list of one-hot encoded column names
one_hot_encoded_columns = feature_names.tolist()
# Extract original categorical features by splitting at the first underscore or dot
original_categorical_features = list(set([re.split(r'[_.]', col)[0] for col in one_hot_encoded_columns]))
print("Original Categorical Features:")
print(original_categorical_features)
Original Categorical Features: ['V2188', 'V2163', 'V2116', 'V2134', 'V49', 'sex', 'V2105', 'V2175', 'V2460', 'V2128', 'V2108', 'V2195', 'missing', 'V2181', 'V2183', 'V2140', 'V2143', 'V2193', 'V2197', 'V2153', 'V2101', 'V2186', 'V2166', 'V2178', 'V2157', 'V2156', 'wave', 'V2137', 'V2196', 'V2187', 'V2182', 'V2176', 'V2180', 'V2185', 'V2125', 'V2171', 'RESPONDENT', 'V13', 'V2201', 'V2189', 'V2152', 'V2177', 'V2164', 'race', 'V2155', 'V2179', 'V2194', 'V2184', 'V2191', 'V2173', 'V2172']
In [ ]:
# List of original categorical features
original_categorical_features = [
'V2137', 'V2172', 'V2181', 'V2178', 'V2134', 'V2163', 'V2197', 'V2188', 'V2191', 'V2155',
'V2128', 'V2105', 'V2175', 'V2185', 'V2153', 'V2194', 'V2183', 'V2143', 'V2184', 'V2460',
'race', 'V2907', 'V2494', 'RESPONDENT', 'V2164', 'V2146', 'V49', 'V2182', 'V13', 'V2152',
'V2176', 'V2196', 'V2187', 'V2173', 'V2108', 'V2033', 'V2177', 'V2030', 'V2171', 'V2119',
'V2908', 'V2195', 'V2116', 'V2180', 'V2186', 'V2166', 'V2140', 'V2156', 'V2189', 'V2201',
'V2169', 'V2122', 'missing', 'sex', 'V2125', 'V2179', 'V2193', 'V2101', 'wave', 'V2157'
]
# Compute mean absolute SHAP values
mean_abs_shap_values = np.abs(shap_values).mean(axis=0)
# Create a dictionary to map one-hot encoded features to their original features
feature_mapping = {}
for feature in original_categorical_features:
feature_mapping[feature] = [col for col in feature_names if col.startswith(feature)]
# Aggregate SHAP values for each original feature
aggregated_shap_values = {}
for feature, cols in feature_mapping.items():
# Find the indices of the one-hot encoded columns for this feature
indices = [feature_names.tolist().index(col) for col in cols]
# Sum the mean absolute SHAP values for these columns
aggregated_shap_values[feature] = np.sum(mean_abs_shap_values[indices])
# Convert the aggregated SHAP values to a DataFrame
aggregated_shap_df = pd.DataFrame({
'Feature': list(aggregated_shap_values.keys()),
'Aggregated_SHAP': list(aggregated_shap_values.values())
})
# Sort by aggregated SHAP values in descending order
aggregated_shap_df = aggregated_shap_df.sort_values(by='Aggregated_SHAP', ascending=False)
# Display the top 20 aggregated features
top_n = 20 # Set to 20 for top 20 features
print("Top 20 Aggregated SHAP Features:")
print(aggregated_shap_df.head(top_n))
# Plot the top 20 aggregated features
plt.figure(figsize=(12, 8))
sns.barplot(x='Aggregated_SHAP', y='Feature', data=aggregated_shap_df.head(top_n), palette='viridis')
plt.title(f'Top {top_n} Aggregated SHAP Features')
plt.xlabel('Aggregated SHAP Value')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
Top 20 Aggregated SHAP Features: Feature Aggregated_SHAP 58 wave 0.840751 57 V2101 0.704158 42 V2116 0.459052 11 V2105 0.208962 24 V2164 0.107361 10 V2128 0.087983 55 V2179 0.085944 45 V2166 0.083725 29 V2152 0.074671 56 V2193 0.068521 12 V2175 0.061661 34 V2108 0.056134 30 V2176 0.055857 28 V13 0.054423 18 V2184 0.054002 17 V2143 0.049636 20 race 0.049562 41 V2195 0.048198 33 V2173 0.047251 36 V2177 0.045113
/tmp/ipykernel_1702033/2393572530.py:43: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Aggregated_SHAP', y='Feature', data=aggregated_shap_df.head(top_n), palette='viridis')
In [ ]:
# Ensure SHAP is installed
import shap
shap.initjs() # For visualization in notebooks
# Compute SHAP interaction values (using a sample for faster computation)
sample_idx = np.random.choice(X_processed.shape[0], 100, replace=False) # Use a sample of 100 instances
shap_interaction_values = explainer.shap_interaction_values(X_processed[sample_idx])
# Get feature names from the preprocessor
feature_names = encoded_feature_names # From your existing code
In [ ]:
# Step 1: Store interactions and their values, avoiding duplicates and self-interactions
interaction_results = []
for i, feature_i in enumerate(original_categorical_features):
for j, feature_j in enumerate(original_categorical_features):
# Skip self-interactions
if feature_i == feature_j:
continue
# Ensure unique pairs by sorting feature names
feature_pair = tuple(sorted([feature_i, feature_j]))
# Skip if the pair is already processed
if feature_pair in [result[0] for result in interaction_results]:
continue
# Get indices for feature_i and feature_j
indices_i = [feature_names.tolist().index(col) for col in feature_mapping[feature_i] if col in feature_names.tolist()]
indices_j = [feature_names.tolist().index(col) for col in feature_mapping[feature_j] if col in feature_names.tolist()]
if not indices_i or not indices_j:
continue
# Compute interaction value
value = np.sum(np.abs(shap_interaction_values[:, indices_i, :][:, :, indices_j]))
interaction_results.append((feature_pair, value))
# Step 2: Sort the interactions by their values
sorted_interactions = sorted(interaction_results, key=lambda x: x[1], reverse=True)
# Step 3: Convert results to a DataFrame for easy aggregation
interaction_df = pd.DataFrame(sorted_interactions, columns=["Feature Pair", "Interaction Value"])
interaction_df[["Feature A", "Feature B"]] = pd.DataFrame(interaction_df["Feature Pair"].tolist(), index=interaction_df.index)
interaction_df = interaction_df.drop(columns=["Feature Pair"])
# Step 4: Select the top 30 interactions
top_30_interactions = interaction_df.sort_values(by="Interaction Value", ascending=False).head(30)
# Step 5: Display the results
print("Top 30 Feature Interactions (Unique Pairs):")
print(top_30_interactions)
# Step 6: Visualize the top 30 interactions
import matplotlib.pyplot as plt
# Create a bar plot
plt.figure(figsize=(12, 8))
plt.barh(
top_30_interactions.apply(lambda row: f"{row['Feature A']} & {row['Feature B']}", axis=1),
top_30_interactions["Interaction Value"],
color="skyblue",
)
plt.xlabel("Interaction Value")
plt.ylabel("Feature Pairs")
plt.title("Top 30 Feature Interactions (Unique Pairs)")
plt.gca().invert_yaxis() # Invert y-axis for better readability
plt.tight_layout()
plt.show()
Top 30 Feature Interactions (Unique Pairs):
Interaction Value Feature A Feature B
0 94.461174 V2116 wave
1 68.112961 V2105 wave
2 57.410591 V2101 wave
3 29.019850 V2101 V2116
4 24.331301 V2105 V2116
5 21.626293 V2108 wave
6 14.987869 V2101 V2105
7 12.275227 V2108 V2116
8 11.319855 V2164 wave
9 11.317589 V2166 wave
10 11.201500 race wave
11 11.175924 V2196 wave
12 10.676391 V2163 wave
13 10.323991 V2194 wave
14 9.517755 V2179 wave
15 9.090555 V2152 wave
16 8.913261 V2191 wave
17 8.352642 V2176 wave
18 7.616253 V2101 V2108
19 7.319203 V2116 V2166
20 7.314087 V13 wave
21 6.935249 V2173 wave
22 5.815115 V2197 wave
23 5.635795 V2101 V2166
24 5.446934 V2193 wave
25 5.402956 V2195 wave
26 5.175810 V13 V2101
27 5.164973 V2175 wave
28 5.072132 V2172 wave
29 4.982261 V2116 V2152
CatBoost¶
In [ ]:
import os
import logging
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import MissingIndicator
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from scipy.stats import uniform, randint
# Create a pipeline with the preprocessor and CatBoost classifier
pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', CatBoostClassifier(
iterations= 500, # Increase the number of iterations if needed
learning_rate=0.1,
depth=6,
loss_function='Logloss',
verbose=0,
random_seed=RANDOM_STATE
))
])
# Define the parameter distribution for RandomizedSearchCV
param_dist = {
'classifier__iterations': randint(1000, 5000), # Wider range for iterations
'classifier__learning_rate': uniform(0.01, 0.3), # Wider range for learning rate
'classifier__depth': randint(4, 12), # Wider range for depth
'classifier__l2_leaf_reg': uniform(1e-2, 10), # L2 regularization
'classifier__border_count': randint(32, 255), # Border count
'classifier__bagging_temperature': uniform(0, 1), # Bagging temperature
'classifier__random_strength': uniform(1e-9, 10), # Random strength
'classifier__od_type': ['IncToDec', 'Iter'], # Overfitting detector type
'classifier__od_wait': randint(10, 50) # Overfitting detector wait
}
# Perform RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
estimator=pipeline,
param_distributions=param_dist,
n_iter=20, # Increase the number of parameter settings sampled if needed
cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
scoring=SCORING_METRIC,
n_jobs=CPU_COUNT,
verbose=VERBOSE,
random_state=RANDOM_STATE
)
random_search.fit(X_train_with_indicators, y_train)
# Log the best parameters and score
logging.info("Best parameters found: " + str(random_search.best_params_))
logging.info(f"Best cross-validation {SCORING_METRIC}: {random_search.best_score_:.4f}")
# Evaluate the best model
best_model = random_search.best_estimator_
# Transform the data using the fitted preprocessor in the best model
X_train_transformed = best_model.named_steps['preprocessor'].transform(X_train_with_indicators)
X_test_transformed = best_model.named_steps['preprocessor'].transform(X_test_with_indicators)
# Predict and evaluate
y_pred = best_model.predict(X_test_with_indicators)
y_pred_proba = best_model.predict_proba(X_test_with_indicators)[:, 1]
logging.info("=== Best CatBoost Model Evaluation ===")
logging.info("Confusion Matrix:\n" + str(confusion_matrix(y_test, y_pred)))
logging.info("\nClassification Report:\n" + str(classification_report(y_test, y_pred)))
logging.info(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
# Plot ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.4f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Best CatBoost Model ROC Curve on Test Data')
plt.legend(loc='lower right')
plt.show()
# Save the best model
model_save_path = 'best_catboost_model.pkl'
joblib.dump(best_model, model_save_path)
logging.info(f"Best CatBoost model saved to '{model_save_path}'.")
Fitting 3 folds for each of 20 candidates, totalling 60 fits
2025-02-15 09:46:30,631 - INFO - Best parameters found: {'classifier__bagging_temperature': 0.45924889196586716, 'classifier__border_count': 148, 'classifier__depth': 7, 'classifier__iterations': 3919, 'classifier__l2_leaf_reg': 7.090725777960454, 'classifier__learning_rate': 0.016175348288740735, 'classifier__od_type': 'Iter', 'classifier__od_wait': 33, 'classifier__random_strength': 8.324426409004218}
2025-02-15 09:46:30,633 - INFO - Best cross-validation roc_auc: 0.9065
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
/storage/home/szn5432/work/vaping_project/.venv/lib64/python3.11/site-packages/sklearn/preprocessing/_encoders.py:246: UserWarning: Found unknown categories in columns [41] during transform. These unknown categories will be encoded as all zeros
warnings.warn(
2025-02-15 09:46:31,436 - INFO - === Best CatBoost Model Evaluation ===
2025-02-15 09:46:31,443 - INFO - Confusion Matrix:
[[2365 564]
[ 580 3037]]
2025-02-15 09:46:31,456 - INFO -
Classification Report:
precision recall f1-score support
0.0 0.80 0.81 0.81 2929
1.0 0.84 0.84 0.84 3617
accuracy 0.83 6546
macro avg 0.82 0.82 0.82 6546
weighted avg 0.83 0.83 0.83 6546
2025-02-15 09:46:31,460 - INFO - ROC AUC: 0.9107
2025-02-15 09:46:31,706 - INFO - Best CatBoost model saved to 'best_catboost_model.pkl'.
In [ ]:
model_filename = os.path.expanduser('~/work/vaping_project_data/best_cb_model.joblib')
joblib.dump(best_model, model_filename)
logging.info(f"Final CatBoost model saved to: {model_filename}")
2025-02-15 09:49:03,429 - INFO - Final CatBoost model saved to: /storage/home/szn5432/work/vaping_project_data/best_cb_model.joblib
In [ ]:
# Load the model (when needed)
file_path = os.path.expanduser('~/work/vaping_project_data/best_cb_model.joblib')
loaded_cb = joblib.load(file_path)
print("Model loaded successfully.")
Model loaded successfully.
In [ ]:
# Function to plot feature importances
def plot_feature_importance(loaded_cb, feature_names, top_n=20, title="Feature Importance"):
"""
Plots the top N feature importances from a trained model.
"""
if hasattr(loaded_cb, 'feature_importances_'):
importances = loaded_cb.feature_importances_
elif hasattr(loaded_cb, 'named_steps') and 'classifier' in loaded_cb.named_steps:
if hasattr(loaded_cb.named_steps['classifier'], 'feature_importances_'):
importances = loaded_cb.named_steps['classifier'].feature_importances_
else:
raise ValueError("Classifier does not have feature_importances_ attribute.")
else:
raise ValueError("Provided model does not have feature_importances_ attribute.")
fi_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
fi_df = fi_df.sort_values('Importance', ascending=False).head(top_n)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=fi_df)
plt.title(title)
plt.tight_layout()
plt.show()
# Get the feature names from the preprocessor
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
# Plot the top 20 most important features
plot_feature_importance(best_model, feature_names, top_n=20, title="Top 20 Most Important Features")
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def plot_aggregated_feature_importance(model, top_n=20, title="Top 20 Aggregated Feature Importance"):
"""
Plots the top N aggregated feature importances for a CatBoost model
that is wrapped inside a Pipeline with a ColumnTransformer.
Parameters
----------
model : Pipeline
A scikit-learn Pipeline that includes:
- 'preprocessor': a ColumnTransformer or other transformer
- 'classifier': a CatBoostClassifier
top_n : int, optional (default=20)
How many top aggregated features to display.
title : str, optional
Title of the plot.
"""
# 1. Get the feature names after the preprocessor step
feature_names = loaded_cb.named_steps['preprocessor'].get_feature_names_out()
# 2. Get the feature importances from CatBoost
catboost_estimator = loaded_cb.named_steps['classifier']
if not hasattr(catboost_estimator, 'feature_importances_'):
raise AttributeError("The CatBoost classifier does not expose 'feature_importances_'.")
importances = catboost_estimator.feature_importances_
# 3. Aggregate importances by original feature
aggregated_importance = {}
for name, imp in zip(feature_names, importances):
# Example naming conventions after ColumnTransformer + OneHotEncoder:
# "onehotencoder__Gender_Male"
# "remainder__Age"
# Adjust this parsing logic as necessary for your pipeline.
if "__" in name:
# Split on the double underscore to separate the transformer name vs. the actual column
parts = name.split("__", maxsplit=1)
# parts[0] might be 'onehotencoder' or 'remainder'
# parts[1] might be 'Gender_Male' or 'Age'
# We'll then split again on '_' if needed to get just the original column name
col_part = parts[1]
# If the column was numeric (remainder), it may be simply 'Age'.
# If the column was OHE, it might be 'Gender_Male' or 'Gender_Female'.
# A simple approach is to take everything before the first underscore as the feature name:
if "_" in col_part:
original_feature = col_part.split("_", maxsplit=1)[0]
else:
original_feature = col_part
else:
# If there's no double underscore, assume the whole name is the feature
original_feature = name
# Sum up the importances
aggregated_importance[original_feature] = aggregated_importance.get(original_feature, 0.0) + imp
# 4. Make a DataFrame of aggregated importances and sort
agg_df = pd.DataFrame(list(aggregated_importance.items()), columns=["Feature", "Importance"])
agg_df = agg_df.sort_values("Importance", ascending=False)
# 5. Plot top N aggregated feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=agg_df.head(top_n))
plt.title(title)
plt.tight_layout()
plt.show()
# --- Usage Example ---
# Assuming you have your best_model pipeline (with 'preprocessor' and CatBoost 'classifier'):
plot_aggregated_feature_importance(best_model, top_n=20, title="Top 20 Aggregated Feature Importance")
In [ ]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
def get_top_aggregated_features(loaded_cb, top_n=10):
"""
Returns a list of top_n original (aggregated) feature names
based on the CatBoost feature importances in a Pipeline.
Parameters
----------
model : Pipeline
A scikit-learn Pipeline with steps:
- "preprocessor" : ColumnTransformer (or similar)
- "classifier" : CatBoostClassifier
top_n : int
Number of top features to return
Returns
-------
list of str
Top N aggregated feature names
"""
# Extract feature names that come out of the preprocessor
feature_names = loaded_cb.named_steps['preprocessor'].get_feature_names_out()
# Extract importances from CatBoost
catboost_estimator = loaded_cb.named_steps['classifier']
importances = catboost_estimator.feature_importances_
# Aggregate importances by the original (pre-encoding) feature name
aggregated_importance = {}
for name, imp in zip(feature_names, importances):
if "__" in name:
# Example: "onehotencoder__Gender_Male" -> original_feature = "Gender"
parts = name.split("__", maxsplit=1)
col_part = parts[1]
if "_" in col_part:
# For OHE columns like "Gender_Male"
original_feature = col_part.split("_", maxsplit=1)[0]
else:
# For remainder numeric columns
original_feature = col_part
else:
# If no __, assume name is the feature
original_feature = name
aggregated_importance[original_feature] = (
aggregated_importance.get(original_feature, 0.0) + imp
)
# Turn into a DataFrame, sort, and get top_n
agg_df = pd.DataFrame(
list(aggregated_importance.items()),
columns=["Feature", "Importance"]
).sort_values("Importance", ascending=False)
return agg_df.head(top_n)["Feature"].tolist()
# 1. Get the top 10 features by aggregated importance
top_features = get_top_aggregated_features(loaded_cb, top_n=10)
print("Top 10 aggregated features:\n", top_features)
# Plot the partial dependence plot
for feat in top_features:
if feat not in X_train_with_indicators.columns:
print(f"Skipping feature '{feat}' as it is not found in the DataFrame.")
continue
fig, ax = plt.subplots(figsize=(6, 4))
PartialDependenceDisplay.from_estimator(
estimator=loaded_cb,
X=X_train_with_indicators,
features=[feat],
kind='average',
grid_resolution=50,
target=1, # positive class for binary classification
ax=ax
)
plt.title(f"Partial Dependence of {feat}")
plt.show()
Top 10 aggregated features: ['wave', 'V2116', 'V2105', 'V2101', 'race', 'V2196', 'V2152', 'V2191', 'V2179', 'V2108']
In [ ]:
import shap
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
X_train_transformed = loaded_cb.named_steps['preprocessor'].transform(X_train_with_indicators)
def plot_top_shap_aggregated_features(model, X_train_transformed, feature_names, top_n=20, title="Top 20 SHAP Aggregated Features"):
"""
Plots the top N aggregated SHAP feature importances for a model.
Parameters
----------
model : Pipeline
A scikit-learn Pipeline that includes:
- 'preprocessor': a ColumnTransformer or other transformer
- 'classifier': a CatBoostClassifier
X_train_transformed : array-like
Transformed training data (output of the preprocessor).
feature_names : array-like
Feature names after transformation.
top_n : int, optional (default=20)
How many top aggregated features to display.
title : str, optional
Title of the plot.
"""
# 1. Extract the classifier from the pipeline
catboost_estimator = loaded_cb.named_steps['classifier']
# 2. Initialize SHAP explainer for the CatBoost model
explainer = shap.TreeExplainer(catboost_estimator)
# 3. Compute SHAP values for the transformed training data
shap_values = explainer.shap_values(X_train_transformed)
# For classification problems, shap_values is a list (one element per class).
# We use the positive class for binary classification (index 1).
if isinstance(shap_values, list):
shap_values = shap_values[1]
# 4. Aggregate SHAP values back to the original feature names
aggregated_shap = {}
for i, name in enumerate(feature_names):
# Parse original feature name from encoded feature name
if "__" in name:
col_part = name.split("__", maxsplit=1)[1] # Split after the double underscore
if "_" in col_part:
original_feature = col_part.split("_", maxsplit=1)[0]
else:
original_feature = col_part
else:
original_feature = name
# Sum SHAP values for the same original feature
aggregated_shap[original_feature] = aggregated_shap.get(original_feature, 0.0) + abs(shap_values[:, i]).mean()
# 5. Create a DataFrame of aggregated SHAP values and sort by importance
shap_df = pd.DataFrame(list(aggregated_shap.items()), columns=["Feature", "SHAP Importance"])
shap_df = shap_df.sort_values("SHAP Importance", ascending=False)
# 6. Plot the top N SHAP aggregated feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x="SHAP Importance", y="Feature", data=shap_df.head(top_n))
plt.title(title)
plt.tight_layout()
plt.show()
# --- Usage Example ---
# Assuming you have:
# - 'best_model': your fitted pipeline
# - 'X_train_transformed': the transformed training data
# - 'feature_names': the output of 'get_feature_names_out()' from the preprocessor
plot_top_shap_aggregated_features(
model=best_model,
X_train_transformed=X_train_transformed,
feature_names=best_model.named_steps['preprocessor'].get_feature_names_out(),
top_n=20,
title="Top 20 SHAP Aggregated Features"
)
In [ ]:
###################################
# SHAP Interactions
###################################
import shap
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# --- Step 1: Extract the classifier from the pipeline ---
catboost_estimator = loaded_cb.named_steps['classifier']
# --- Step 2: Initialize SHAP explainer ---
explainer = shap.TreeExplainer(catboost_estimator)
In [ ]:
# Sample a subset of the training data
sample_size = 2 # Adjust based on available memory and dataset size
X_train_sample = X_train_with_indicators.sample(n=sample_size, random_state=42)
# Transform the sampled data
X_train_sample_transformed = loaded_cb.named_steps['preprocessor'].transform(X_train_sample)
# Compute SHAP interaction values for the sample
interaction_values = explainer.shap_interaction_values(X_train_sample_transformed)
In [ ]:
import numpy as np
import pandas as pd
# List of original categorical features
original_categorical_features = [
'V2137', 'V2172', 'V2181', 'V2178', 'V2134', 'V2163', 'V2197', 'V2188', 'V2191', 'V2155',
'V2128', 'V2105', 'V2175', 'V2185', 'V2153', 'V2194', 'V2183', 'V2143', 'V2184', 'V2460',
'race', 'V2907', 'V2494', 'RESPONDENT', 'V2164', 'V2146', 'V49', 'V2182', 'V13', 'V2152',
'V2176', 'V2196', 'V2187', 'V2173', 'V2108', 'V2033', 'V2177', 'V2030', 'V2171', 'V2119',
'V2908', 'V2195', 'V2116', 'V2180', 'V2186', 'V2166', 'V2140', 'V2156', 'V2189', 'V2201',
'V2169', 'V2122', 'missing', 'sex', 'V2125', 'V2179', 'V2193', 'V2101', 'wave', 'V2157'
]
# Get the feature names from your fitted pipeline
feature_names = best_model.named_steps['preprocessor'].get_feature_names_out()
# Inspect them to see how they're named
print("Transformed feature names:\n", feature_names)
# A helper function to check if a transformed column belongs to a given original feature
def belongs_to_original(col_name: str, original_feat: str) -> bool:
parts = col_name.split("__", maxsplit=1)
if len(parts) == 2:
encoded_part = parts[1] # e.g. "V2137_0"
else:
encoded_part = parts[0]
return (encoded_part == original_feat) or encoded_part.startswith(original_feat + "_")
# Create a dictionary to map each original categorical feature to its transformed columns
feature_mapping = {}
for feature in original_categorical_features:
matched_cols = [
col for col in feature_names
if belongs_to_original(col, feature)
]
feature_mapping[feature] = matched_cols
# Now do your SHAP interaction aggregation
aggregated_interaction_matrix = np.zeros((len(original_categorical_features), len(original_categorical_features)))
for i, feature_i in enumerate(original_categorical_features):
for j, feature_j in enumerate(original_categorical_features):
indices_i = [feature_names.tolist().index(c) for c in feature_mapping[feature_i]]
indices_j = [feature_names.tolist().index(c) for c in feature_mapping[feature_j]]
# Sum absolute interaction values for these columns
aggregated_interaction_matrix[i, j] = np.sum(np.abs(interaction_values[:, indices_i, :][:, :, indices_j]))
aggregated_interaction_df = pd.DataFrame(
aggregated_interaction_matrix,
index=original_categorical_features,
columns=original_categorical_features
)
print("Aggregated Interaction DataFrame:\n", aggregated_interaction_df)
Transformed feature names:
['cat__V2178_2.0' 'cat__V2178_3.0' 'cat__V2178_4.0' 'cat__V2178_5.0'
'cat__V2178_6.0' 'cat__V2178_nan' 'cat__V2188_1.0' 'cat__V2188_nan'
'cat__V2197_1.0' 'cat__V2197_2.0' 'cat__V2197_3.0' 'cat__V2197_4.0'
'cat__V2197_nan' 'cat__V2184_2.0' 'cat__V2184_3.0' 'cat__V2184_4.0'
'cat__V2184_nan' 'cat__V2186_1.0' 'cat__V2186_nan' 'cat__V2171_2.0'
'cat__V2171_6.0' 'cat__V2171_nan' 'cat__V2128_2.0' 'cat__V2128_3.0'
'cat__V2128_4.0' 'cat__V2128_5.0' 'cat__V2128_6.0' 'cat__V2128_7.0'
'cat__V2128_nan' 'cat__V2201_1.0' 'cat__V2201_2.0' 'cat__V2201_3.0'
'cat__V2201_4.0' 'cat__V2201_nan' 'cat__V2173_2.0' 'cat__V2173_3.0'
'cat__V2173_4.0' 'cat__V2173_5.0' 'cat__V2173_6.0' 'cat__V2173_7.0'
'cat__V2173_nan' 'cat__V2194_2.0' 'cat__V2194_3.0' 'cat__V2194_4.0'
'cat__V2194_5.0' 'cat__V2194_6.0' 'cat__V2194_nan' 'cat__V2166_2.0'
'cat__V2166_3.0' 'cat__V2166_4.0' 'cat__V2166_5.0' 'cat__V2166_6.0'
'cat__V2166_7.0' 'cat__V2166_8.0' 'cat__V2166_nan' 'cat__wave_2018'
'cat__wave_2019' 'cat__wave_2020' 'cat__wave_2021' 'cat__wave_2022'
'cat__wave_2023' 'cat__V2176_2.0' 'cat__V2176_3.0' 'cat__V2176_4.0'
'cat__V2176_5.0' 'cat__V2176_6.0' 'cat__V2176_7.0' 'cat__V2176_nan'
'cat__V2175_2.0' 'cat__V2175_3.0' 'cat__V2175_4.0' 'cat__V2175_5.0'
'cat__V2175_6.0' 'cat__V2175_7.0' 'cat__V2175_nan' 'cat__V2177_2.0'
'cat__V2177_3.0' 'cat__V2177_4.0' 'cat__V2177_5.0' 'cat__V2177_6.0'
'cat__V2177_7.0' 'cat__V2177_nan' 'cat__V2116_2.0' 'cat__V2116_3.0'
'cat__V2116_4.0' 'cat__V2116_5.0' 'cat__V2116_6.0' 'cat__V2116_7.0'
'cat__V2116_nan' 'cat__V2125_2.0' 'cat__V2125_3.0' 'cat__V2125_4.0'
'cat__V2125_5.0' 'cat__V2125_6.0' 'cat__V2125_7.0' 'cat__V2125_nan'
'cat__V2182_2.0' 'cat__V2182_3.0' 'cat__V2182_4.0' 'cat__V2182_nan'
'cat__sex_1.0' 'cat__race_2.0' 'cat__race_3.0' 'cat__V2460_2.0'
'cat__V2460_3.0' 'cat__V2460_4.0' 'cat__V2460_5.0' 'cat__V2460_6.0'
'cat__V2460_7.0' 'cat__V2460_nan' 'cat__RESPONDENT_AGE_2.0'
'cat__RESPONDENT_AGE_nan' 'cat__V2185_1.0' 'cat__V2185_nan'
'cat__V2193_2.0' 'cat__V2193_3.0' 'cat__V2193_4.0' 'cat__V2193_5.0'
'cat__V2193_6.0' 'cat__V2193_7.0' 'cat__V2193_8.0' 'cat__V2193_9.0'
'cat__V2193_10.0' 'cat__V2193_nan' 'cat__V2163_2.0' 'cat__V2163_3.0'
'cat__V2163_4.0' 'cat__V2163_5.0' 'cat__V2163_6.0' 'cat__V2163_7.0'
'cat__V2163_nan' 'cat__V49_1.0' 'cat__V49_2.0' 'cat__V49_3.0'
'cat__V49_nan' 'cat__V2108_2.0' 'cat__V2108_3.0' 'cat__V2108_4.0'
'cat__V2108_5.0' 'cat__V2108_6.0' 'cat__V2108_nan' 'cat__V2101_2.0'
'cat__V2101_3.0' 'cat__V2101_4.0' 'cat__V2101_5.0' 'cat__V2101_nan'
'cat__V2180_2.0' 'cat__V2180_3.0' 'cat__V2180_4.0' 'cat__V2180_nan'
'cat__V2164_2.0' 'cat__V2164_3.0' 'cat__V2164_4.0' 'cat__V2164_5.0'
'cat__V2164_6.0' 'cat__V2164_7.0' 'cat__V2164_nan' 'cat__V2191_2.0'
'cat__V2191_3.0' 'cat__V2191_4.0' 'cat__V2191_5.0' 'cat__V2191_6.0'
'cat__V2191_7.0' 'cat__V2191_8.0' 'cat__V2191_nan' 'cat__V2195_2.0'
'cat__V2195_3.0' 'cat__V2195_4.0' 'cat__V2195_5.0' 'cat__V2195_6.0'
'cat__V2195_nan' 'cat__V2155_1.0' 'cat__V2155_nan' 'cat__V2196_2.0'
'cat__V2196_3.0' 'cat__V2196_4.0' 'cat__V2196_5.0' 'cat__V2196_6.0'
'cat__V2196_nan' 'cat__V2189_1.0' 'cat__V2189_nan' 'cat__V2179_2.0'
'cat__V2179_3.0' 'cat__V2179_4.0' 'cat__V2179_5.0' 'cat__V2179_6.0'
'cat__V2179_7.0' 'cat__V2179_8.0' 'cat__V2179_9.0' 'cat__V2179_nan'
'cat__V13_2' 'cat__V13_3' 'cat__V13_4' 'cat__V2143_2.0' 'cat__V2143_3.0'
'cat__V2143_4.0' 'cat__V2143_5.0' 'cat__V2143_6.0' 'cat__V2143_7.0'
'cat__V2143_nan' 'cat__V2134_2.0' 'cat__V2134_3.0' 'cat__V2134_4.0'
'cat__V2134_5.0' 'cat__V2134_6.0' 'cat__V2134_7.0' 'cat__V2134_nan'
'cat__V2172_2.0' 'cat__V2172_3.0' 'cat__V2172_4.0' 'cat__V2172_nan'
'cat__V2137_2.0' 'cat__V2137_3.0' 'cat__V2137_4.0' 'cat__V2137_5.0'
'cat__V2137_6.0' 'cat__V2137_7.0' 'cat__V2137_nan' 'cat__V2140_2.0'
'cat__V2140_4.0' 'cat__V2140_5.0' 'cat__V2140_6.0' 'cat__V2140_7.0'
'cat__V2140_nan' 'cat__V2105_2.0' 'cat__V2105_3.0' 'cat__V2105_4.0'
'cat__V2105_5.0' 'cat__V2105_6.0' 'cat__V2105_7.0' 'cat__V2105_nan'
'cat__V2157_1.0' 'cat__V2157_nan' 'cat__V2183_2.0' 'cat__V2183_3.0'
'cat__V2183_4.0' 'cat__V2183_nan' 'cat__V2187_1.0' 'cat__V2187_nan'
'cat__V2181_2.0' 'cat__V2181_3.0' 'cat__V2181_4.0' 'cat__V2181_nan'
'cat__V2152_1.0' 'cat__V2152_2.0' 'cat__V2152_3.0' 'cat__V2152_4.0'
'cat__V2152_5.0' 'cat__V2152_6.0' 'cat__V2152_7.0' 'cat__V2152_8.0'
'cat__V2152_9.0' 'cat__V2152_nan' 'cat__V2153_2.0' 'cat__V2153_3.0'
'cat__V2153_4.0' 'cat__V2153_nan' 'cat__V2156_1.0' 'cat__V2156_nan'
'cat__missing_V2178_True' 'cat__missing_V2188_True'
'cat__missing_V2197_True' 'cat__missing_V2184_True'
'cat__missing_V2186_True' 'cat__missing_V2171_True'
'cat__missing_V2128_True' 'cat__missing_V2201_True'
'cat__missing_V2173_True' 'cat__missing_V2194_True'
'cat__missing_V2166_True' 'cat__missing_V2176_True'
'cat__missing_V2175_True' 'cat__missing_V2177_True'
'cat__missing_V2116_True' 'cat__missing_V2125_True'
'cat__missing_V2182_True' 'cat__missing_V2460_True'
'cat__missing_RESPONDENT_AGE_True' 'cat__missing_V2185_True'
'cat__missing_V2193_True' 'cat__missing_V2163_True'
'cat__missing_V49_True' 'cat__missing_V2108_True'
'cat__missing_V2101_True' 'cat__missing_V2180_True'
'cat__missing_V2164_True' 'cat__missing_V2191_True'
'cat__missing_V2195_True' 'cat__missing_V2155_True'
'cat__missing_V2196_True' 'cat__missing_V2189_True'
'cat__missing_V2179_True' 'cat__missing_V2143_True'
'cat__missing_V2134_True' 'cat__missing_V2172_True'
'cat__missing_V2137_True' 'cat__missing_V2140_True'
'cat__missing_V2105_True' 'cat__missing_V2157_True'
'cat__missing_V2183_True' 'cat__missing_V2187_True'
'cat__missing_V2181_True' 'cat__missing_V2152_True'
'cat__missing_V2153_True' 'cat__missing_V2156_True']
Aggregated Interaction DataFrame:
V2137 V2172 V2181 V2178 V2134 \
V2137 3.290348e-03 0.000058 0.000057 0.000046 1.228783e-05
V2172 5.801001e-05 0.036453 0.001411 0.000912 2.559703e-04
V2181 5.727996e-05 0.001411 0.015649 0.000689 2.132224e-05
V2178 4.601106e-05 0.000912 0.000689 0.017057 1.084229e-04
V2134 1.228783e-05 0.000256 0.000021 0.000108 3.977924e-03
V2163 1.619578e-04 0.002878 0.001778 0.002497 1.119420e-03
V2197 4.423864e-05 0.001065 0.000123 0.000389 3.886163e-05
V2188 2.309646e-04 0.000351 0.001489 0.001942 5.925815e-04
V2191 1.115818e-04 0.002676 0.001730 0.001493 1.027248e-04
V2155 3.853873e-07 0.000674 0.001473 0.000789 8.013086e-08
V2128 1.045936e-04 0.000170 0.000046 0.000043 3.988419e-06
V2105 2.061868e-04 0.012276 0.006536 0.021041 5.858707e-04
V2175 5.298451e-05 0.002327 0.001250 0.001751 8.722402e-05
V2185 1.022528e-05 0.000353 0.000503 0.000028 1.659341e-05
V2153 3.865524e-05 0.001057 0.000200 0.000105 1.011352e-05
V2194 3.518209e-04 0.002630 0.001592 0.001795 2.106592e-04
V2183 4.126358e-04 0.002614 0.001698 0.001640 1.325228e-04
V2143 6.160799e-05 0.000144 0.000035 0.000030 5.761200e-06
V2184 1.179433e-04 0.010305 0.001919 0.002645 1.882302e-04
V2460 4.623492e-06 0.000091 0.000040 0.000063 3.028373e-06
race 4.579310e-05 0.000269 0.000887 0.001670 1.160390e-04
V2907 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V2494 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
RESPONDENT 6.074983e-05 0.000633 0.000610 0.000591 4.204937e-05
V2164 4.390109e-04 0.003228 0.001492 0.003086 4.721385e-04
V2146 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V49 1.367812e-04 0.002162 0.000863 0.003248 7.521613e-05
V2182 6.599708e-05 0.002539 0.000586 0.000529 8.014844e-05
V13 1.001896e-04 0.002855 0.001863 0.003177 2.432611e-05
V2152 2.515096e-04 0.002587 0.001081 0.001536 3.369002e-04
V2176 9.971148e-05 0.003391 0.000948 0.003888 6.190503e-05
V2196 6.279371e-05 0.002950 0.003280 0.002430 4.967960e-05
V2187 7.196244e-07 0.000026 0.000208 0.004478 1.288283e-05
V2173 1.458582e-04 0.001530 0.001926 0.005064 2.275831e-04
V2108 4.532427e-05 0.000435 0.000469 0.001280 3.270080e-05
V2033 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V2177 5.123756e-05 0.003809 0.001393 0.001134 3.953874e-04
V2030 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V2171 3.025585e-06 0.000046 0.000031 0.000029 2.299128e-07
V2119 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V2908 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V2195 6.728445e-05 0.002004 0.001184 0.000895 4.059375e-05
V2116 2.368459e-04 0.004952 0.001921 0.002697 1.920520e-04
V2180 5.100981e-05 0.002118 0.000804 0.001856 1.427523e-04
V2186 1.284499e-06 0.000304 0.000145 0.000033 1.145650e-06
V2166 2.849611e-04 0.007115 0.001056 0.002999 6.797044e-04
V2140 2.244612e-06 0.000019 0.000003 0.000005 2.623857e-05
V2156 1.591964e-05 0.000763 0.000048 0.000230 0.000000e+00
V2189 1.809382e-05 0.000265 0.000143 0.000152 2.749115e-05
V2201 2.057817e-05 0.000705 0.000431 0.000557 2.771747e-05
V2169 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
V2122 0.000000e+00 0.000000 0.000000 0.000000 0.000000e+00
missing 1.302991e-04 0.001753 0.001000 0.001227 1.443024e-04
sex 4.855872e-05 0.002203 0.003122 0.001071 2.214620e-05
V2125 6.945713e-06 0.000120 0.000078 0.000046 4.987354e-06
V2179 8.244927e-05 0.005191 0.001297 0.006686 2.424108e-04
V2193 1.410531e-04 0.001559 0.001805 0.006972 8.529841e-05
V2101 6.977184e-05 0.000516 0.000538 0.000440 6.703475e-05
wave 1.441623e-03 0.022204 0.012419 0.024816 9.281955e-04
V2157 9.000518e-05 0.011629 0.000145 0.000514 4.520196e-05
V2163 V2197 V2188 V2191 V2155 ... V2169 \
V2137 0.000162 0.000044 0.000231 0.000112 3.853873e-07 ... 0.0
V2172 0.002878 0.001065 0.000351 0.002676 6.741730e-04 ... 0.0
V2181 0.001778 0.000123 0.001489 0.001730 1.472853e-03 ... 0.0
V2178 0.002497 0.000389 0.001942 0.001493 7.889872e-04 ... 0.0
V2134 0.001119 0.000039 0.000593 0.000103 8.013086e-08 ... 0.0
V2163 0.031062 0.000350 0.011878 0.006844 1.333255e-03 ... 0.0
V2197 0.000350 0.012217 0.000122 0.001059 3.766781e-04 ... 0.0
V2188 0.011878 0.000122 0.056294 0.001452 0.000000e+00 ... 0.0
V2191 0.006844 0.001059 0.001452 0.062045 1.533827e-03 ... 0.0
V2155 0.001333 0.000377 0.000000 0.001534 2.150288e-02 ... 0.0
V2128 0.000125 0.000111 0.000123 0.000116 1.529010e-05 ... 0.0
V2105 0.025624 0.002345 0.025587 0.022655 1.138207e-02 ... 0.0
V2175 0.004807 0.000904 0.000683 0.003781 1.665315e-03 ... 0.0
V2185 0.000664 0.000151 0.000085 0.000612 4.174384e-04 ... 0.0
V2153 0.000943 0.000628 0.001532 0.001256 5.241064e-05 ... 0.0
V2194 0.005244 0.000568 0.002509 0.005804 8.418942e-04 ... 0.0
V2183 0.013637 0.001147 0.000961 0.004148 2.265506e-03 ... 0.0
V2143 0.000097 0.000022 0.000074 0.000103 2.758811e-05 ... 0.0
V2184 0.010678 0.000499 0.007488 0.007492 1.059992e-03 ... 0.0
V2460 0.000074 0.000127 0.000000 0.000045 2.408255e-05 ... 0.0
race 0.004662 0.001383 0.007512 0.002647 3.265106e-04 ... 0.0
V2907 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V2494 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
RESPONDENT 0.006159 0.000354 0.002969 0.002256 7.573764e-04 ... 0.0
V2164 0.010029 0.003265 0.003100 0.009492 1.225573e-03 ... 0.0
V2146 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V49 0.009347 0.001981 0.006958 0.002406 1.532764e-03 ... 0.0
V2182 0.004331 0.000264 0.000793 0.003398 1.252161e-03 ... 0.0
V13 0.004877 0.004062 0.005611 0.011941 5.255729e-04 ... 0.0
V2152 0.013849 0.001526 0.001883 0.016666 5.542620e-03 ... 0.0
V2176 0.005450 0.007375 0.001302 0.006005 1.173518e-03 ... 0.0
V2196 0.005582 0.003381 0.003380 0.011326 3.780921e-03 ... 0.0
V2187 0.001510 0.000596 0.000147 0.001088 1.814104e-03 ... 0.0
V2173 0.005793 0.001092 0.001874 0.005735 1.039251e-03 ... 0.0
V2108 0.000918 0.000370 0.000438 0.000791 1.470156e-04 ... 0.0
V2033 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V2177 0.012957 0.000745 0.002374 0.005384 3.641342e-03 ... 0.0
V2030 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V2171 0.000028 0.000005 0.000149 0.000250 7.811983e-05 ... 0.0
V2119 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V2908 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V2195 0.003358 0.000906 0.001512 0.003992 2.270690e-03 ... 0.0
V2116 0.015932 0.002656 0.007450 0.017679 1.040673e-03 ... 0.0
V2180 0.000963 0.001108 0.001647 0.003240 4.879539e-04 ... 0.0
V2186 0.000373 0.000163 0.000411 0.001692 6.472437e-04 ... 0.0
V2166 0.026191 0.001771 0.001285 0.008454 1.530930e-03 ... 0.0
V2140 0.000014 0.000007 0.000003 0.000007 1.202832e-05 ... 0.0
V2156 0.001183 0.000168 0.000033 0.000645 4.001194e-04 ... 0.0
V2189 0.002986 0.000187 0.000415 0.002361 8.053417e-04 ... 0.0
V2201 0.001523 0.000225 0.001064 0.001286 1.715401e-04 ... 0.0
V2169 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
V2122 0.000000 0.000000 0.000000 0.000000 0.000000e+00 ... 0.0
missing 0.004868 0.000800 0.001602 0.002177 5.546067e-04 ... 0.0
sex 0.003147 0.000483 0.006887 0.001303 1.305297e-03 ... 0.0
V2125 0.000083 0.000073 0.000033 0.000060 1.834481e-05 ... 0.0
V2179 0.016220 0.002975 0.002036 0.009874 3.068118e-03 ... 0.0
V2193 0.006381 0.000423 0.001087 0.005248 1.135447e-03 ... 0.0
V2101 0.004093 0.001623 0.001868 0.002238 1.077280e-03 ... 0.0
wave 0.041002 0.030261 0.012158 0.042805 1.236974e-02 ... 0.0
V2157 0.014435 0.000607 0.000000 0.003810 6.681082e-04 ... 0.0
V2122 missing sex V2125 V2179 V2193 \
V2137 0.0 0.000130 0.000049 6.945713e-06 0.000082 0.000141
V2172 0.0 0.001753 0.002203 1.202531e-04 0.005191 0.001559
V2181 0.0 0.001000 0.003122 7.846045e-05 0.001297 0.001805
V2178 0.0 0.001227 0.001071 4.618155e-05 0.006686 0.006972
V2134 0.0 0.000144 0.000022 4.987354e-06 0.000242 0.000085
V2163 0.0 0.004868 0.003147 8.287642e-05 0.016220 0.006381
V2197 0.0 0.000800 0.000483 7.336517e-05 0.002975 0.000423
V2188 0.0 0.001602 0.006887 3.296022e-05 0.002036 0.001087
V2191 0.0 0.002177 0.001303 5.978814e-05 0.009874 0.005248
V2155 0.0 0.000555 0.001305 1.834481e-05 0.003068 0.001135
V2128 0.0 0.000166 0.000235 5.278031e-05 0.000189 0.000052
V2105 0.0 0.011739 0.033367 1.155057e-04 0.063313 0.010649
V2175 0.0 0.002553 0.004833 9.044859e-05 0.015161 0.006784
V2185 0.0 0.000344 0.000066 4.023579e-06 0.001705 0.000874
V2153 0.0 0.001123 0.000627 2.541190e-05 0.002273 0.001496
V2194 0.0 0.002683 0.010773 2.117903e-04 0.008950 0.002081
V2183 0.0 0.004035 0.001675 9.323043e-05 0.018317 0.007331
V2143 0.0 0.000110 0.000154 1.070003e-05 0.000069 0.000230
V2184 0.0 0.002421 0.002722 1.764218e-04 0.003710 0.001853
V2460 0.0 0.000169 0.000033 3.036354e-06 0.000145 0.000070
race 0.0 0.002807 0.011958 1.800773e-04 0.005492 0.005691
V2907 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V2494 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
RESPONDENT 0.0 0.001436 0.000693 3.752440e-06 0.004736 0.000632
V2164 0.0 0.003845 0.003471 1.827520e-04 0.014087 0.025429
V2146 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V49 0.0 0.002933 0.005475 8.508893e-05 0.007755 0.004193
V2182 0.0 0.003147 0.002015 7.989134e-05 0.007934 0.002508
V13 0.0 0.002522 0.004142 3.941463e-05 0.015365 0.003531
V2152 0.0 0.003885 0.018970 8.081511e-05 0.017696 0.005216
V2176 0.0 0.002552 0.002989 2.218161e-05 0.007453 0.008351
V2196 0.0 0.002644 0.006379 1.216230e-04 0.012311 0.015221
V2187 0.0 0.000481 0.001689 8.803605e-06 0.002079 0.000689
V2173 0.0 0.003842 0.009092 9.267340e-05 0.016540 0.006551
V2108 0.0 0.000744 0.000395 7.392260e-05 0.001142 0.001002
V2033 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V2177 0.0 0.003019 0.001743 2.240288e-04 0.012049 0.005962
V2030 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V2171 0.0 0.000151 0.000010 6.784830e-08 0.000110 0.000179
V2119 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V2908 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V2195 0.0 0.001795 0.003817 8.267769e-05 0.005848 0.004778
V2116 0.0 0.007601 0.013019 4.775995e-04 0.017196 0.006381
V2180 0.0 0.002693 0.002132 2.964032e-05 0.007085 0.001735
V2186 0.0 0.000395 0.000938 2.823976e-06 0.001776 0.000801
V2166 0.0 0.006739 0.005281 8.856284e-05 0.011620 0.006439
V2140 0.0 0.000137 0.000004 4.850142e-06 0.000063 0.000022
V2156 0.0 0.000538 0.000104 9.452533e-06 0.000524 0.000160
V2189 0.0 0.002018 0.002574 2.279768e-05 0.001818 0.002545
V2201 0.0 0.000851 0.000582 1.916033e-05 0.005652 0.001051
V2169 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
V2122 0.0 0.000000 0.000000 0.000000e+00 0.000000 0.000000
missing 0.0 0.063807 0.002482 2.820202e-04 0.004974 0.002075
sex 0.0 0.002482 0.222048 1.229450e-04 0.012901 0.003316
V2125 0.0 0.000282 0.000123 2.467815e-03 0.000092 0.000045
V2179 0.0 0.004974 0.012901 9.169292e-05 0.113295 0.005443
V2193 0.0 0.002075 0.003316 4.488613e-05 0.005443 0.116065
V2101 0.0 0.001910 0.001581 4.439017e-05 0.005998 0.000927
wave 0.0 0.033823 0.063628 1.027013e-03 0.120772 0.042560
V2157 0.0 0.003805 0.001879 1.753067e-05 0.009146 0.003135
V2101 wave V2157
V2137 0.000070 0.001442 9.000518e-05
V2172 0.000516 0.022204 1.162857e-02
V2181 0.000538 0.012419 1.452850e-04
V2178 0.000440 0.024816 5.143092e-04
V2134 0.000067 0.000928 4.520196e-05
V2163 0.004093 0.041002 1.443499e-02
V2197 0.001623 0.030261 6.065777e-04
V2188 0.001868 0.012158 0.000000e+00
V2191 0.002238 0.042805 3.810327e-03
V2155 0.001077 0.012370 6.681082e-04
V2128 0.000178 0.004767 5.427367e-05
V2105 0.010748 1.136753 1.513988e-02
V2175 0.001713 0.051602 3.533106e-03
V2185 0.000156 0.004576 3.571820e-04
V2153 0.000362 0.007102 2.383781e-03
V2194 0.002464 0.104285 2.547308e-03
V2183 0.003693 0.070119 3.732516e-03
V2143 0.000101 0.001867 5.224842e-05
V2184 0.003229 0.064718 3.160546e-03
V2460 0.000047 0.000748 1.694982e-04
race 0.004120 0.323612 3.675930e-03
V2907 0.000000 0.000000 0.000000e+00
V2494 0.000000 0.000000 0.000000e+00
RESPONDENT 0.002167 0.028003 2.799535e-04
V2164 0.002443 0.066317 4.266689e-03
V2146 0.000000 0.000000 0.000000e+00
V49 0.002219 0.023401 1.026972e-02
V2182 0.001238 0.021054 2.189647e-03
V13 0.003105 0.058597 3.486217e-03
V2152 0.003383 0.099763 8.783717e-03
V2176 0.001735 0.096700 1.425147e-02
V2196 0.004824 0.218511 4.931574e-03
V2187 0.000373 0.004686 2.159413e-04
V2173 0.004344 0.092705 4.704848e-03
V2108 0.001348 0.089846 1.385032e-03
V2033 0.000000 0.000000 0.000000e+00
V2177 0.004121 0.077873 9.695057e-03
V2030 0.000000 0.000000 0.000000e+00
V2171 0.000040 0.000628 2.726120e-04
V2119 0.000000 0.000000 0.000000e+00
V2908 0.000000 0.000000 0.000000e+00
V2195 0.000828 0.042655 1.049865e-03
V2116 0.011746 1.455743 1.703481e-02
V2180 0.001427 0.020008 2.303820e-03
V2186 0.000076 0.008708 9.195180e-04
V2166 0.004499 0.068395 3.037345e-02
V2140 0.000010 0.000188 6.438142e-07
V2156 0.000282 0.003137 6.100717e-04
V2189 0.000370 0.010539 1.024734e-03
V2201 0.000502 0.038261 1.902385e-03
V2169 0.000000 0.000000 0.000000e+00
V2122 0.000000 0.000000 0.000000e+00
missing 0.001910 0.033823 3.805279e-03
sex 0.001581 0.063628 1.879282e-03
V2125 0.000044 0.001027 1.753067e-05
V2179 0.005998 0.120772 9.146010e-03
V2193 0.000927 0.042560 3.134862e-03
V2101 0.103005 0.317949 6.550920e-04
wave 0.317949 5.987740 1.436741e-02
V2157 0.000655 0.014367 1.050268e-01
[60 rows x 60 columns]
In [ ]:
# Step 1: Store interactions and their values, avoiding self-interactions AND duplicates
interaction_results = []
for i, feature_i in enumerate(original_categorical_features):
# Only loop j from i+1 to end, ensuring j > i
for j in range(i + 1, len(original_categorical_features)):
feature_j = original_categorical_features[j]
# Get indices for feature_i and feature_j
indices_i = [
feature_names.tolist().index(col)
for col in feature_mapping[feature_i]
if col in feature_names.tolist()
]
indices_j = [
feature_names.tolist().index(col)
for col in feature_mapping[feature_j]
if col in feature_names.tolist()
]
if not indices_i or not indices_j:
continue
# Compute interaction value
value = np.sum(np.abs(interaction_values[:, indices_i, :][:, :, indices_j]))
interaction_results.append(((feature_i, feature_j), value))
# Step 2: Sort the interactions by their absolute value (descending)
sorted_interactions = sorted(interaction_results, key=lambda x: x[1], reverse=True)
# Step 3: Select the top 30 interactions
top_30_interactions = sorted_interactions[:30]
# Step 4: Display the results
print("Top 30 Feature Interactions (Excluding Self-Interactions & Duplicates):")
for (feature_pair, interaction_value) in top_30_interactions:
print(f"Interaction ({feature_pair[0]}, {feature_pair[1]}): {interaction_value}")
# Step 5 (Optional): Visualize the top 30 interactions
import matplotlib.pyplot as plt
# Extract feature pairs and their values
feature_pairs = [f"{pair[0]} & {pair[1]}" for pair, _ in top_30_interactions]
values = [value for _, value in top_30_interactions]
# Create a bar plot
plt.figure(figsize=(12, 8))
plt.barh(feature_pairs, values, color='skyblue')
plt.xlabel('Interaction Value')
plt.ylabel('Feature Pairs')
plt.title('Top 30 Feature Interactions (Excluding Duplicates)')
plt.gca().invert_yaxis() # Invert y-axis for better readability
plt.tight_layout()
plt.show()
Top 30 Feature Interactions (Excluding Self-Interactions & Duplicates): Interaction (V2116, wave): 1.455742984648874 Interaction (V2105, wave): 1.136753219294689 Interaction (race, wave): 0.32361201148137897 Interaction (V2101, wave): 0.31794924169861044 Interaction (V2196, wave): 0.21851062034184396 Interaction (V2179, wave): 0.12077151708295893 Interaction (V2194, wave): 0.10428466907697606 Interaction (V2152, wave): 0.09976265557869676 Interaction (V2176, wave): 0.09669982811555715 Interaction (V2173, wave): 0.09270483934866512 Interaction (V2108, wave): 0.08984579836600001 Interaction (V2177, wave): 0.07787268823469257 Interaction (V2183, wave): 0.07011925071180561 Interaction (V2105, V13): 0.0697534793138348 Interaction (V2166, wave): 0.06839478060713346 Interaction (V2164, wave): 0.06631673096471599 Interaction (V2184, wave): 0.06471843982411986 Interaction (sex, wave): 0.06362835383535415 Interaction (V2105, V2179): 0.06331328790635543 Interaction (V13, wave): 0.05859665738852144 Interaction (V2105, V2116): 0.05634949443254901 Interaction (V2105, V2152): 0.05585307824196819 Interaction (V2175, wave): 0.05160217708296773 Interaction (V2105, V2196): 0.04333985201715526 Interaction (V2191, wave): 0.04280474158118715 Interaction (V2195, wave): 0.0426549832333175 Interaction (V2193, wave): 0.04255965855948828 Interaction (V2176, V2116): 0.041027333765607917 Interaction (V2163, wave): 0.04100182534246335 Interaction (V2201, wave): 0.03826080205088958